From d234a2b06f9b6a2b8b9cccc3fdfe77337a89268f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 22 Oct 2021 09:16:27 -0400 Subject: [PATCH 001/194] ARROW-14416: [R] Fix package installation on the Raspberry Pi I've added -latomic to PKG_LIBS, will need to test this though Closes #11506 from thisisnic/ARROW-14416_pi Authored-by: Nic Crane Signed-off-by: Neal Richardson --- r/configure | 1 + 1 file changed, 1 insertion(+) diff --git a/r/configure b/r/configure index cd2314949bf2d..13177c5875f35 100755 --- a/r/configure +++ b/r/configure @@ -204,6 +204,7 @@ fi # See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81358 for similar example if grep raspbian /etc/os-release >/dev/null 2>&1; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_CXXFLAGS=-latomic" + PKG_LIBS="-latomic $PKG_LIBS" fi # If libarrow uses the old GLIBCXX ABI, so we have to use it too From 9ce3440670a9bbb0204e88af5466e39612b649d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dragos=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 22 Oct 2021 15:47:33 +0100 Subject: [PATCH 002/194] ARROW-13156 [R] bindings for str_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #11473 from dragosmg/ARROW-13156_str_count_bindings Lead-authored-by: Dragos Moldovan-Grünfeld Co-authored-by: Dragoș Moldovan-Grünfeld Signed-off-by: Nic Crane --- r/R/dplyr-functions.R | 13 +++++ r/tests/testthat/test-dplyr-funcs-string.R | 60 ++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index dbb9d5f46f603..717cdae966275 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -645,6 +645,19 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) { out } +nse_funcs$str_count <- function(string, pattern) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (!is.string(pattern)) { + arrow_not_supported("`pattern` must be a length 1 character vector; other values") + } + arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex") + Expression$create( + arrow_fun, + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) +} + # String function helpers # format `pattern` as needed for case insensitivity and literal matching by RE2 diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index dd59b5ac55da5..333735be4f093 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1336,3 +1336,63 @@ test_that("str_starts, str_ends, startsWith, endsWith", { df ) }) + +test_that("str_count", { + df <- tibble( + cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"), + dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") + ) + + expect_dplyr_equal( + input %>% + mutate(a_count = str_count(cities, pattern = "a")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(p_count = str_count(cities, pattern = "d")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(p_count = str_count(cities, + pattern = regex("d", ignore_case = TRUE) + )) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(e_count = str_count(cities, pattern = "u")) %>% + collect(), + df + ) + + # nse_funcs$str_count() is not vectorised over pattern + expect_dplyr_equal( + input %>% + mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% + collect(), + df, + warning = TRUE + ) + + expect_dplyr_equal( + 
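+    # "." is a regex metacharacter here, so this counts every character of `dots`;
+    # the fixed(".") test below counts literal dots only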
input %>% + mutate(dots_count = str_count(dots, ".")) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate(dots_count = str_count(dots, fixed("."))) %>% + collect(), + df + ) +}) From 176c1132e578ecf0b81429246f95742f250e9305 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Fri, 22 Oct 2021 12:06:44 -0400 Subject: [PATCH 003/194] ARROW-14350: [IR] Add filter expression to Source node This PR adds a `filter` expression to `Source` nodes to support consumers that implement predicate pushdown. Closes #11438 from cpcloud/ARROW-14350 Authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Signed-off-by: Benjamin Kietzman --- cpp/src/generated/Relation_generated.h | 21 ++++++++++++++++++++- experimental/computeir/Relation.fbs | 7 +++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/cpp/src/generated/Relation_generated.h b/cpp/src/generated/Relation_generated.h index 6c9d9bc927a95..2c58784e0c45b 100644 --- a/cpp/src/generated/Relation_generated.h +++ b/cpp/src/generated/Relation_generated.h @@ -1327,7 +1327,8 @@ struct Source FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_BASE = 4, VT_NAME = 6, - VT_SCHEMA = 8 + VT_FILTER = 8, + VT_SCHEMA = 10 }; const org::apache::arrow::computeir::flatbuf::RelBase *base() const { return GetPointer(VT_BASE); @@ -1335,6 +1336,15 @@ struct Source FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::String *name() const { return GetPointer(VT_NAME); } + /// An optional expression used to filter out rows directly from the source. + /// + /// Useful for consumers that implement predicate pushdown. + /// + /// A missing filter value indicates no filter, i.e., all rows are + /// returned from the source. + const org::apache::arrow::computeir::flatbuf::Expression *filter() const { + return GetPointer(VT_FILTER); + } const org::apache::arrow::flatbuf::Schema *schema() const { return GetPointer(VT_SCHEMA); } @@ -1344,6 +1354,8 @@ struct Source FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyTable(base()) && VerifyOffsetRequired(verifier, VT_NAME) && verifier.VerifyString(name()) && + VerifyOffset(verifier, VT_FILTER) && + verifier.VerifyTable(filter()) && VerifyOffsetRequired(verifier, VT_SCHEMA) && verifier.VerifyTable(schema()) && verifier.EndTable(); @@ -1360,6 +1372,9 @@ struct SourceBuilder { void add_name(flatbuffers::Offset name) { fbb_.AddOffset(Source::VT_NAME, name); } + void add_filter(flatbuffers::Offset filter) { + fbb_.AddOffset(Source::VT_FILTER, filter); + } void add_schema(flatbuffers::Offset schema) { fbb_.AddOffset(Source::VT_SCHEMA, schema); } @@ -1382,9 +1397,11 @@ inline flatbuffers::Offset CreateSource( flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset base = 0, flatbuffers::Offset name = 0, + flatbuffers::Offset filter = 0, flatbuffers::Offset schema = 0) { SourceBuilder builder_(_fbb); builder_.add_schema(schema); + builder_.add_filter(filter); builder_.add_name(name); builder_.add_base(base); return builder_.Finish(); @@ -1394,12 +1411,14 @@ inline flatbuffers::Offset CreateSourceDirect( flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset base = 0, const char *name = nullptr, + flatbuffers::Offset filter = 0, flatbuffers::Offset schema = 0) { auto name__ = name ? 
_fbb.CreateString(name) : 0; return org::apache::arrow::computeir::flatbuf::CreateSource( _fbb, base, name__, + filter, schema); } diff --git a/experimental/computeir/Relation.fbs b/experimental/computeir/Relation.fbs index ab0156e0f1b05..3af159a033952 100644 --- a/experimental/computeir/Relation.fbs +++ b/experimental/computeir/Relation.fbs @@ -197,6 +197,13 @@ table LiteralRelation { table Source { base: RelBase (required); name: string (required); + /// An optional expression used to filter out rows directly from the source. + /// + /// Useful for consumers that implement predicate pushdown. + /// + /// A missing filter value indicates no filter, i.e., all rows are + /// returned from the source. + filter: Expression; schema: org.apache.arrow.flatbuf.Schema (required); } From ee18c08c6e65ac4968e7615cbae3558d85cf5e02 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Fri, 22 Oct 2021 12:11:33 -0400 Subject: [PATCH 004/194] ARROW-14349: [IR] Remove RelBase This PR removes `RelBase` and associated column (re)mapping types. Producers/consumers are expected to use projections for reordering for the time being. Closes #11435 from cpcloud/ARROW-14349 Authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Signed-off-by: Benjamin Kietzman --- cpp/src/generated/Relation_generated.h | 483 ++++++------------------- experimental/computeir/Relation.fbs | 74 ++-- 2 files changed, 138 insertions(+), 419 deletions(-) diff --git a/cpp/src/generated/Relation_generated.h b/cpp/src/generated/Relation_generated.h index 2c58784e0c45b..0dbbc86ed5cc7 100644 --- a/cpp/src/generated/Relation_generated.h +++ b/cpp/src/generated/Relation_generated.h @@ -16,18 +16,9 @@ namespace arrow { namespace computeir { namespace flatbuf { -struct Remap; -struct RemapBuilder; - -struct PassThrough; -struct PassThroughBuilder; - struct RelId; struct RelIdBuilder; -struct RelBase; -struct RelBaseBuilder; - struct Filter; struct FilterBuilder; @@ -64,55 +55,6 @@ struct SourceBuilder; struct Relation; struct RelationBuilder; -/// A union for the different colum remapping variants -enum class Emit : uint8_t { - NONE = 0, - Remap = 1, - PassThrough = 2, - MIN = NONE, - MAX = PassThrough -}; - -inline const Emit (&EnumValuesEmit())[3] { - static const Emit values[] = { - Emit::NONE, - Emit::Remap, - Emit::PassThrough - }; - return values; -} - -inline const char * const *EnumNamesEmit() { - static const char * const names[4] = { - "NONE", - "Remap", - "PassThrough", - nullptr - }; - return names; -} - -inline const char *EnumNameEmit(Emit e) { - if (flatbuffers::IsOutRange(e, Emit::NONE, Emit::PassThrough)) return ""; - const size_t index = static_cast(e); - return EnumNamesEmit()[index]; -} - -template struct EmitTraits { - static const Emit enum_value = Emit::NONE; -}; - -template<> struct EmitTraits { - static const Emit enum_value = Emit::Remap; -}; - -template<> struct EmitTraits { - static const Emit enum_value = Emit::PassThrough; -}; - -bool VerifyEmit(flatbuffers::Verifier &verifier, const void *obj, Emit type); -bool VerifyEmitVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); - enum class JoinKind : uint8_t { Anti = 0, Cross = 1, @@ -290,97 +232,6 @@ template<> struct RelationImplTraits> *values, const flatbuffers::Vector *types); -/// A data type indicating that a different mapping of columns -/// should occur in the output. 
-/// -/// For example: -/// -/// Given a query `SELECT b, a FROM t` where `t` has columns a, b, c -/// the mapping value for the projection would equal [1, 0]. -struct Remap FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef RemapBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_MAPPING = 4 - }; - const flatbuffers::Vector> *mapping() const { - return GetPointer> *>(VT_MAPPING); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_MAPPING) && - verifier.VerifyVector(mapping()) && - verifier.VerifyVectorOfTables(mapping()) && - verifier.EndTable(); - } -}; - -struct RemapBuilder { - typedef Remap Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_mapping(flatbuffers::Offset>> mapping) { - fbb_.AddOffset(Remap::VT_MAPPING, mapping); - } - explicit RemapBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - RemapBuilder &operator=(const RemapBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - fbb_.Required(o, Remap::VT_MAPPING); - return o; - } -}; - -inline flatbuffers::Offset CreateRemap( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> mapping = 0) { - RemapBuilder builder_(_fbb); - builder_.add_mapping(mapping); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateRemapDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *mapping = nullptr) { - auto mapping__ = mapping ? _fbb.CreateVector>(*mapping) : 0; - return org::apache::arrow::computeir::flatbuf::CreateRemap( - _fbb, - mapping__); -} - -struct PassThrough FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef PassThroughBuilder Builder; - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - verifier.EndTable(); - } -}; - -struct PassThroughBuilder { - typedef PassThrough Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - explicit PassThroughBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - PassThroughBuilder &operator=(const PassThroughBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreatePassThrough( - flatbuffers::FlatBufferBuilder &_fbb) { - PassThroughBuilder builder_(_fbb); - return builder_.Finish(); -} - /// An identifier for relations in a query. /// /// A table is used here to allow plan implementations optionality. 
@@ -426,101 +277,18 @@ inline flatbuffers::Offset CreateRelId( return builder_.Finish(); } -/// Fields common to every relational operator -struct RelBase FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef RelBaseBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_OUTPUT_MAPPING_TYPE = 4, - VT_OUTPUT_MAPPING = 6, - VT_ID = 8 - }; - org::apache::arrow::computeir::flatbuf::Emit output_mapping_type() const { - return static_cast(GetField(VT_OUTPUT_MAPPING_TYPE, 0)); - } - /// Output remapping of ordinal columns for a given operation - const void *output_mapping() const { - return GetPointer(VT_OUTPUT_MAPPING); - } - template const T *output_mapping_as() const; - const org::apache::arrow::computeir::flatbuf::Remap *output_mapping_as_Remap() const { - return output_mapping_type() == org::apache::arrow::computeir::flatbuf::Emit::Remap ? static_cast(output_mapping()) : nullptr; - } - const org::apache::arrow::computeir::flatbuf::PassThrough *output_mapping_as_PassThrough() const { - return output_mapping_type() == org::apache::arrow::computeir::flatbuf::Emit::PassThrough ? static_cast(output_mapping()) : nullptr; - } - /// An identifiier for a relation. The identifier should be unique over the - /// entire plan. Optional. - const org::apache::arrow::computeir::flatbuf::RelId *id() const { - return GetPointer(VT_ID); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyField(verifier, VT_OUTPUT_MAPPING_TYPE) && - VerifyOffsetRequired(verifier, VT_OUTPUT_MAPPING) && - VerifyEmit(verifier, output_mapping(), output_mapping_type()) && - VerifyOffset(verifier, VT_ID) && - verifier.VerifyTable(id()) && - verifier.EndTable(); - } -}; - -template<> inline const org::apache::arrow::computeir::flatbuf::Remap *RelBase::output_mapping_as() const { - return output_mapping_as_Remap(); -} - -template<> inline const org::apache::arrow::computeir::flatbuf::PassThrough *RelBase::output_mapping_as() const { - return output_mapping_as_PassThrough(); -} - -struct RelBaseBuilder { - typedef RelBase Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_output_mapping_type(org::apache::arrow::computeir::flatbuf::Emit output_mapping_type) { - fbb_.AddElement(RelBase::VT_OUTPUT_MAPPING_TYPE, static_cast(output_mapping_type), 0); - } - void add_output_mapping(flatbuffers::Offset output_mapping) { - fbb_.AddOffset(RelBase::VT_OUTPUT_MAPPING, output_mapping); - } - void add_id(flatbuffers::Offset id) { - fbb_.AddOffset(RelBase::VT_ID, id); - } - explicit RelBaseBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - RelBaseBuilder &operator=(const RelBaseBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - fbb_.Required(o, RelBase::VT_OUTPUT_MAPPING); - return o; - } -}; - -inline flatbuffers::Offset CreateRelBase( - flatbuffers::FlatBufferBuilder &_fbb, - org::apache::arrow::computeir::flatbuf::Emit output_mapping_type = org::apache::arrow::computeir::flatbuf::Emit::NONE, - flatbuffers::Offset output_mapping = 0, - flatbuffers::Offset id = 0) { - RelBaseBuilder builder_(_fbb); - builder_.add_id(id); - builder_.add_output_mapping(output_mapping); - builder_.add_output_mapping_type(output_mapping_type); - return builder_.Finish(); -} - /// Filter operation struct Filter FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef FilterBuilder Builder; enum 
FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_REL = 6, VT_PREDICATE = 8 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relation const org::apache::arrow::computeir::flatbuf::Relation *rel() const { @@ -534,8 +302,8 @@ struct Filter FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_REL) && verifier.VerifyTable(rel()) && VerifyOffsetRequired(verifier, VT_PREDICATE) && @@ -548,8 +316,8 @@ struct FilterBuilder { typedef Filter Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Filter::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Filter::VT_ID, id); } void add_rel(flatbuffers::Offset rel) { fbb_.AddOffset(Filter::VT_REL, rel); @@ -565,7 +333,6 @@ struct FilterBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Filter::VT_BASE); fbb_.Required(o, Filter::VT_REL); fbb_.Required(o, Filter::VT_PREDICATE); return o; @@ -574,13 +341,13 @@ struct FilterBuilder { inline flatbuffers::Offset CreateFilter( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, flatbuffers::Offset predicate = 0) { FilterBuilder builder_(_fbb); builder_.add_predicate(predicate); builder_.add_rel(rel); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } @@ -588,13 +355,14 @@ inline flatbuffers::Offset CreateFilter( struct Project FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef ProjectBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_REL = 6, VT_EXPRESSIONS = 8 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relation const org::apache::arrow::computeir::flatbuf::Relation *rel() const { @@ -607,8 +375,8 @@ struct Project FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_REL) && verifier.VerifyTable(rel()) && VerifyOffsetRequired(verifier, VT_EXPRESSIONS) && @@ -622,8 +390,8 @@ struct ProjectBuilder { typedef Project Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Project::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Project::VT_ID, id); } void add_rel(flatbuffers::Offset rel) { fbb_.AddOffset(Project::VT_REL, rel); @@ -639,7 +407,6 @@ struct ProjectBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Project::VT_BASE); fbb_.Required(o, Project::VT_REL); fbb_.Required(o, Project::VT_EXPRESSIONS); return o; @@ -648,25 +415,25 @@ struct ProjectBuilder { inline flatbuffers::Offset CreateProject( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, flatbuffers::Offset>> expressions = 0) { ProjectBuilder builder_(_fbb); builder_.add_expressions(expressions); builder_.add_rel(rel); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } inline flatbuffers::Offset CreateProjectDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, const std::vector> *expressions = nullptr) { auto expressions__ = expressions ? _fbb.CreateVector>(*expressions) : 0; return org::apache::arrow::computeir::flatbuf::CreateProject( _fbb, - base, + id, rel, expressions__); } @@ -731,14 +498,15 @@ inline flatbuffers::Offset CreateGroupingDirect( struct Aggregate FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef AggregateBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_REL = 6, VT_MEASURES = 8, VT_GROUPINGS = 10 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relation const org::apache::arrow::computeir::flatbuf::Relation *rel() const { @@ -768,8 +536,8 @@ struct Aggregate FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_REL) && verifier.VerifyTable(rel()) && VerifyOffsetRequired(verifier, VT_MEASURES) && @@ -786,8 +554,8 @@ struct AggregateBuilder { typedef Aggregate Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Aggregate::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Aggregate::VT_ID, id); } void add_rel(flatbuffers::Offset rel) { fbb_.AddOffset(Aggregate::VT_REL, rel); @@ -806,7 +574,6 @@ struct AggregateBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Aggregate::VT_BASE); fbb_.Required(o, Aggregate::VT_REL); fbb_.Required(o, Aggregate::VT_MEASURES); fbb_.Required(o, Aggregate::VT_GROUPINGS); @@ -816,7 +583,7 @@ struct AggregateBuilder { inline flatbuffers::Offset CreateAggregate( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, flatbuffers::Offset>> measures = 0, flatbuffers::Offset>> groupings = 0) { @@ -824,13 +591,13 @@ inline flatbuffers::Offset CreateAggregate( builder_.add_groupings(groupings); builder_.add_measures(measures); builder_.add_rel(rel); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } inline flatbuffers::Offset CreateAggregateDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, const std::vector> *measures = nullptr, const std::vector> *groupings = nullptr) { @@ -838,7 +605,7 @@ inline flatbuffers::Offset CreateAggregateDirect( auto groupings__ = groupings ? _fbb.CreateVector>(*groupings) : 0; return org::apache::arrow::computeir::flatbuf::CreateAggregate( _fbb, - base, + id, rel, measures__, groupings__); @@ -848,15 +615,16 @@ inline flatbuffers::Offset CreateAggregateDirect( struct Join FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef JoinBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_LEFT = 6, VT_RIGHT = 8, VT_ON_EXPRESSION = 10, VT_JOIN_KIND = 12 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Left relation const org::apache::arrow::computeir::flatbuf::Relation *left() const { @@ -878,8 +646,8 @@ struct Join FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_LEFT) && verifier.VerifyTable(left()) && VerifyOffsetRequired(verifier, VT_RIGHT) && @@ -895,8 +663,8 @@ struct JoinBuilder { typedef Join Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Join::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Join::VT_ID, id); } void add_left(flatbuffers::Offset left) { fbb_.AddOffset(Join::VT_LEFT, left); @@ -918,7 +686,6 @@ struct JoinBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Join::VT_BASE); fbb_.Required(o, Join::VT_LEFT); fbb_.Required(o, Join::VT_RIGHT); fbb_.Required(o, Join::VT_ON_EXPRESSION); @@ -928,7 +695,7 @@ struct JoinBuilder { inline flatbuffers::Offset CreateJoin( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset left = 0, flatbuffers::Offset right = 0, flatbuffers::Offset on_expression = 0, @@ -937,7 +704,7 @@ inline flatbuffers::Offset CreateJoin( builder_.add_on_expression(on_expression); builder_.add_right(right); builder_.add_left(left); - builder_.add_base(base); + builder_.add_id(id); builder_.add_join_kind(join_kind); return builder_.Finish(); } @@ -946,13 +713,14 @@ inline flatbuffers::Offset CreateJoin( struct OrderBy FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef OrderByBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_REL = 6, VT_KEYS = 8 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relation const org::apache::arrow::computeir::flatbuf::Relation *rel() const { @@ -965,8 +733,8 @@ struct OrderBy FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_REL) && verifier.VerifyTable(rel()) && VerifyOffsetRequired(verifier, VT_KEYS) && @@ -980,8 +748,8 @@ struct OrderByBuilder { typedef OrderBy Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(OrderBy::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(OrderBy::VT_ID, id); } void add_rel(flatbuffers::Offset rel) { fbb_.AddOffset(OrderBy::VT_REL, rel); @@ -997,7 +765,6 @@ struct OrderByBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, OrderBy::VT_BASE); fbb_.Required(o, OrderBy::VT_REL); fbb_.Required(o, OrderBy::VT_KEYS); return o; @@ -1006,25 +773,25 @@ struct OrderByBuilder { inline flatbuffers::Offset CreateOrderBy( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, flatbuffers::Offset>> keys = 0) { OrderByBuilder builder_(_fbb); builder_.add_keys(keys); builder_.add_rel(rel); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } inline flatbuffers::Offset CreateOrderByDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, const std::vector> *keys = nullptr) { auto keys__ = keys ? _fbb.CreateVector>(*keys) : 0; return org::apache::arrow::computeir::flatbuf::CreateOrderBy( _fbb, - base, + id, rel, keys__); } @@ -1033,14 +800,15 @@ inline flatbuffers::Offset CreateOrderByDirect( struct Limit FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef LimitBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_REL = 6, VT_OFFSET = 8, VT_COUNT = 10 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relation const org::apache::arrow::computeir::flatbuf::Relation *rel() const { @@ -1056,8 +824,8 @@ struct Limit FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_REL) && verifier.VerifyTable(rel()) && VerifyField(verifier, VT_OFFSET) && @@ -1070,8 +838,8 @@ struct LimitBuilder { typedef Limit Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Limit::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Limit::VT_ID, id); } void add_rel(flatbuffers::Offset rel) { fbb_.AddOffset(Limit::VT_REL, rel); @@ -1090,7 +858,6 @@ struct LimitBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Limit::VT_BASE); fbb_.Required(o, Limit::VT_REL); return o; } @@ -1098,7 +865,7 @@ struct LimitBuilder { inline flatbuffers::Offset CreateLimit( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset rel = 0, uint32_t offset = 0, uint32_t count = 0) { @@ -1106,7 +873,7 @@ inline flatbuffers::Offset CreateLimit( builder_.add_count(count); builder_.add_offset(offset); builder_.add_rel(rel); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } @@ -1114,13 +881,14 @@ inline flatbuffers::Offset CreateLimit( struct SetOperation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef SetOperationBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_RELS = 6, VT_SET_OP = 8 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// Child relations const flatbuffers::Vector> *rels() const { @@ -1132,8 +900,8 @@ struct SetOperation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_RELS) && verifier.VerifyVector(rels()) && verifier.VerifyVectorOfTables(rels()) && @@ -1146,8 +914,8 @@ struct SetOperationBuilder { typedef SetOperation Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(SetOperation::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(SetOperation::VT_ID, id); } void add_rels(flatbuffers::Offset>> rels) { fbb_.AddOffset(SetOperation::VT_RELS, rels); @@ -1163,7 +931,6 @@ struct SetOperationBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, SetOperation::VT_BASE); fbb_.Required(o, SetOperation::VT_RELS); return o; } @@ -1171,25 +938,25 @@ struct SetOperationBuilder { inline flatbuffers::Offset CreateSetOperation( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset>> rels = 0, org::apache::arrow::computeir::flatbuf::SetOpKind set_op = org::apache::arrow::computeir::flatbuf::SetOpKind::Union) { SetOperationBuilder builder_(_fbb); builder_.add_rels(rels); - builder_.add_base(base); + builder_.add_id(id); builder_.add_set_op(set_op); return builder_.Finish(); } inline flatbuffers::Offset CreateSetOperationDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, const std::vector> *rels = nullptr, org::apache::arrow::computeir::flatbuf::SetOpKind set_op = org::apache::arrow::computeir::flatbuf::SetOpKind::Union) { auto rels__ = rels ? _fbb.CreateVector>(*rels) : 0; return org::apache::arrow::computeir::flatbuf::CreateSetOperation( _fbb, - base, + id, rels__, set_op); } @@ -1254,12 +1021,13 @@ inline flatbuffers::Offset CreateLiteralColumnDirect( struct LiteralRelation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef LiteralRelationBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_COLUMNS = 6 }; - /// Common options - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } /// The columns of this literal relation. 
const flatbuffers::Vector> *columns() const { @@ -1267,8 +1035,8 @@ struct LiteralRelation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_COLUMNS) && verifier.VerifyVector(columns()) && verifier.VerifyVectorOfTables(columns()) && @@ -1280,8 +1048,8 @@ struct LiteralRelationBuilder { typedef LiteralRelation Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(LiteralRelation::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(LiteralRelation::VT_ID, id); } void add_columns(flatbuffers::Offset>> columns) { fbb_.AddOffset(LiteralRelation::VT_COLUMNS, columns); @@ -1294,7 +1062,6 @@ struct LiteralRelationBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, LiteralRelation::VT_BASE); fbb_.Required(o, LiteralRelation::VT_COLUMNS); return o; } @@ -1302,22 +1069,22 @@ struct LiteralRelationBuilder { inline flatbuffers::Offset CreateLiteralRelation( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset>> columns = 0) { LiteralRelationBuilder builder_(_fbb); builder_.add_columns(columns); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } inline flatbuffers::Offset CreateLiteralRelationDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, const std::vector> *columns = nullptr) { auto columns__ = columns ? _fbb.CreateVector>(*columns) : 0; return org::apache::arrow::computeir::flatbuf::CreateLiteralRelation( _fbb, - base, + id, columns__); } @@ -1325,13 +1092,15 @@ inline flatbuffers::Offset CreateLiteralRelationDirect( struct Source FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef SourceBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BASE = 4, + VT_ID = 4, VT_NAME = 6, VT_FILTER = 8, VT_SCHEMA = 10 }; - const org::apache::arrow::computeir::flatbuf::RelBase *base() const { - return GetPointer(VT_BASE); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ const org::apache::arrow::computeir::flatbuf::RelId *id() const { + return GetPointer(VT_ID); } const flatbuffers::String *name() const { return GetPointer(VT_NAME); @@ -1350,8 +1119,8 @@ struct Source FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_BASE) && - verifier.VerifyTable(base()) && + VerifyOffset(verifier, VT_ID) && + verifier.VerifyTable(id()) && VerifyOffsetRequired(verifier, VT_NAME) && verifier.VerifyString(name()) && VerifyOffset(verifier, VT_FILTER) && @@ -1366,8 +1135,8 @@ struct SourceBuilder { typedef Source Table; flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_base(flatbuffers::Offset base) { - fbb_.AddOffset(Source::VT_BASE, base); + void add_id(flatbuffers::Offset id) { + fbb_.AddOffset(Source::VT_ID, id); } void add_name(flatbuffers::Offset name) { fbb_.AddOffset(Source::VT_NAME, name); @@ -1386,7 +1155,6 @@ struct SourceBuilder { flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); auto o = flatbuffers::Offset(end); - fbb_.Required(o, Source::VT_BASE); fbb_.Required(o, Source::VT_NAME); fbb_.Required(o, Source::VT_SCHEMA); return o; @@ -1395,7 +1163,7 @@ struct SourceBuilder { inline flatbuffers::Offset CreateSource( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, flatbuffers::Offset name = 0, flatbuffers::Offset filter = 0, flatbuffers::Offset schema = 0) { @@ -1403,20 +1171,20 @@ inline flatbuffers::Offset CreateSource( builder_.add_schema(schema); builder_.add_filter(filter); builder_.add_name(name); - builder_.add_base(base); + builder_.add_id(id); return builder_.Finish(); } inline flatbuffers::Offset CreateSourceDirect( flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset base = 0, + flatbuffers::Offset id = 0, const char *name = nullptr, flatbuffers::Offset filter = 0, flatbuffers::Offset schema = 0) { auto name__ = name ? 
_fbb.CreateString(name) : 0; return org::apache::arrow::computeir::flatbuf::CreateSource( _fbb, - base, + id, name__, filter, schema); @@ -1541,35 +1309,6 @@ inline flatbuffers::Offset CreateRelation( return builder_.Finish(); } -inline bool VerifyEmit(flatbuffers::Verifier &verifier, const void *obj, Emit type) { - switch (type) { - case Emit::NONE: { - return true; - } - case Emit::Remap: { - auto ptr = reinterpret_cast(obj); - return verifier.VerifyTable(ptr); - } - case Emit::PassThrough: { - auto ptr = reinterpret_cast(obj); - return verifier.VerifyTable(ptr); - } - default: return true; - } -} - -inline bool VerifyEmitVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { - if (!values || !types) return !values && !types; - if (values->size() != types->size()) return false; - for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { - if (!VerifyEmit( - verifier, values->Get(i), types->GetEnum(i))) { - return false; - } - } - return true; -} - inline bool VerifyRelationImpl(flatbuffers::Verifier &verifier, const void *obj, RelationImpl type) { switch (type) { case RelationImpl::NONE: { diff --git a/experimental/computeir/Relation.fbs b/experimental/computeir/Relation.fbs index 3af159a033952..12092ec9296bf 100644 --- a/experimental/computeir/Relation.fbs +++ b/experimental/computeir/Relation.fbs @@ -21,26 +21,6 @@ include "Expression.fbs"; namespace org.apache.arrow.computeir.flatbuf; -/// A data type indicating that a different mapping of columns -/// should occur in the output. -/// -/// For example: -/// -/// Given a query `SELECT b, a FROM t` where `t` has columns a, b, c -/// the mapping value for the projection would equal [1, 0]. -table Remap { - mapping: [FieldIndex] (required); -} - -// Pass through indicates that no output remapping should occur. -table PassThrough {} - -/// A union for the different colum remapping variants -union Emit { - Remap, - PassThrough, -} - /// An identifier for relations in a query. /// /// A table is used here to allow plan implementations optionality. @@ -48,20 +28,11 @@ table RelId { id: uint64; } -/// Fields common to every relational operator -table RelBase { - /// Output remapping of ordinal columns for a given operation - output_mapping: Emit (required); - - /// An identifiier for a relation. The identifier should be unique over the - /// entire plan. Optional. - id: RelId; -} - /// Filter operation table Filter { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Child relation rel: Relation (required); /// The expression which will be evaluated against input rows @@ -72,8 +43,9 @@ table Filter { /// Projection table Project { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Child relation rel: Relation (required); /// Expressions which will be evaluated to produce to @@ -89,8 +61,9 @@ table Grouping { /// Aggregate operation table Aggregate { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. 
+ id: RelId; /// Child relation rel: Relation (required); /// Expressions which will be evaluated to produce to @@ -125,8 +98,9 @@ enum JoinKind : uint8 { /// Join between two tables table Join { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Left relation left: Relation (required); /// Right relation @@ -141,8 +115,9 @@ table Join { /// Order by relation table OrderBy { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Child relation rel: Relation (required); /// Define sort order for rows of output. @@ -152,8 +127,9 @@ table OrderBy { /// Limit operation table Limit { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Child relation rel: Relation (required); /// Starting index of rows @@ -171,8 +147,9 @@ enum SetOpKind : uint8 { /// A set operation on two or more relations table SetOperation { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// Child relations rels: [Relation] (required); /// The kind of set operation @@ -187,15 +164,18 @@ table LiteralColumn { /// Literal relation table LiteralRelation { - /// Common options - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; /// The columns of this literal relation. columns: [LiteralColumn] (required); } /// An external source of tabular data table Source { - base: RelBase (required); + /// An identifiier for the relation. The identifier should be unique over the + /// entire plan. Optional. + id: RelId; name: string (required); /// An optional expression used to filter out rows directly from the source. /// From 03669438bbce53078616c7f943a63fb0c11db196 Mon Sep 17 00:00:00 2001 From: Eric Erhardt Date: Fri, 22 Oct 2021 11:12:33 -0500 Subject: [PATCH 005/194] MINOR: [Docs][C#] Update C# documentation Updating the C# README and feature status matrix to match the current implementation. Also resolving the question in https://github.com/apache/arrow/issues/11367. Closes #11378 from eerhardt/UpdateDocs Authored-by: Eric Erhardt Signed-off-by: Eric Erhardt --- csharp/README.md | 19 +++++++------------ docs/source/status.rst | 10 +++++----- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/csharp/README.md b/csharp/README.md index 2a60cd27c5c84..3d0681279a324 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -21,12 +21,13 @@ An implementation of Arrow targeting .NET Standard. -This implementation is under development and may not be suitable for use in production environments. +See our current [feature matrix](https://github.com/apache/arrow/blob/master/docs/source/status.rst) +for currently available features. # Implementation - Arrow 0.11 (specification) -- C# 7.2 +- C# 8 - .NET Standard 1.3 - Asynchronous I/O - Uses modern .NET runtime features such as **Span<T>**, **Memory<T>**, **MemoryManager<T>**, and **System.Buffers** primitives for memory allocation, memory storage, and fast serialization. 
@@ -34,8 +35,8 @@ This implementation is under development and may not be suitable for use in prod # Known Issues -- Can not read Arrow files containing dictionary batches, tensors, or tables. -- Can not easily modify allocation strategy without implementing a custom memory pool. All allocations are currently 64-byte aligned and padded to 8-bytes. +- Cannot read Arrow files containing tensors. +- Cannot easily modify allocation strategy without implementing a custom memory pool. All allocations are currently 64-byte aligned and padded to 8-bytes. - Default memory allocation strategy uses an over-allocation strategy with pointer fixing, which results in significant memory overhead for small buffers. A buffer that requires a single byte for storage may be backed by an allocation of up to 64-bytes to satisfy alignment requirements. - There are currently few builder APIs available for specific array types. Arrays must be built manually with an arrow buffer builder abstraction. - FlatBuffer code generation is not included in the build process. @@ -44,8 +45,6 @@ This implementation is under development and may not be suitable for use in prod - Throws exceptions that are non-specific to the Arrow implementation in some circumstances where it probably should (eg. does not throw ArrowException exceptions) - Lack of code documentation - Lack of usage examples -- Lack of comprehensive unit tests -- Lack of comprehensive benchmarks # Usage @@ -57,7 +56,7 @@ This implementation is under development and may not be suitable for use in prod public static async Task ReadArrowAsync(string filename) { - using (var stream = File.OpenRead("test.arrow")) + using (var stream = File.OpenRead(filename)) using (var reader = new ArrowFileReader(stream)) { var recordBatch = await reader.ReadNextRecordBatchAsync(); @@ -113,10 +112,8 @@ This implementation is under development and may not be suitable for use in prod - Serialization - Exhaustive validation - Dictionary Batch - - Can not serialize or deserialize files or streams containing dictionary batches + - Cannot serialize files or streams containing dictionary batches - Dictionary Encoding - - Schema Metadata - - Schema Field Metadata - Types - Tensor - Table @@ -125,11 +122,9 @@ This implementation is under development and may not be suitable for use in prod - Dense - Sparse - Half-Float - - Dictionary - Array Operations - Equality / Comparison - Casting - - Builders - Compute - There is currently no API available for a compute / kernel abstraction. 
diff --git a/docs/source/status.rst b/docs/source/status.rst index 8e3e998dfb9c2..879f20f81faa8 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -56,7 +56,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+ | Interval | ✓ | ✓ | ✓ | | | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+ -| Fixed Size Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | +| Fixed Size Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+ | Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+ @@ -90,7 +90,7 @@ Data Types | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | | (special) | | | | | | | | +===================+=======+=======+=======+============+=======+=======+=======+ -| Dictionary | ✓ | ✓ (1) | | ✓ (1) | | ✓ (1) | ✓ | +| Dictionary | ✓ | ✓ (1) | | ✓ (1) | ✓ (1) | ✓ (1) | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+ @@ -116,11 +116,11 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Record batches | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ -| Dictionaries | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | +| Dictionaries | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Replacement dictionaries | ✓ | ✓ | | | | | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ -| Delta dictionaries | ✓ (1) | | | | | | ✓ | +| Delta dictionaries | ✓ (1) | | | | ✓ | | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Tensors | ✓ | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ @@ -130,7 +130,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Endianness conversion | ✓ (2) | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ -| Custom schema metadata | ✓ | ✓ | ✓ | | | ✓ | ✓ | +| Custom schema metadata | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ Notes: From d04a46bb8c71a33c043b64ad596f5af43f42580c Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 22 Oct 2021 08:17:08 -1000 Subject: [PATCH 006/194] ARROW-13607: [C++] Add Skyhook to Arrow Closes #10913 from JayjeetAtGithub/skyhook/pr_1 Authored-by: Jayjeet Chakraborty Signed-off-by: Weston Pace --- ci/docker/ubuntu-20.04-cpp.dockerfile | 5 + ci/scripts/cpp_build.sh | 1 + ci/scripts/generate_dataset.py | 47 +++ ci/scripts/install_ceph.sh | 28 ++ ci/scripts/integration_skyhook.sh | 141 +++++++++ cpp/CMakeLists.txt | 11 + cpp/cmake_modules/DefineOptions.cmake | 2 + cpp/cmake_modules/Findlibrados.cmake | 34 +++ cpp/src/arrow/dataset/dataset.h | 3 + cpp/src/arrow/dataset/scanner_internal.h | 5 + cpp/src/skyhook/CMakeLists.txt | 87 ++++++ cpp/src/skyhook/client/CMakeLists.txt | 18 ++ cpp/src/skyhook/client/file_skyhook.cc | 182 ++++++++++++ cpp/src/skyhook/client/file_skyhook.h | 108 +++++++ cpp/src/skyhook/cls/cls_skyhook.cc | 267 ++++++++++++++++++ 
cpp/src/skyhook/cls/cls_skyhook_test.cc | 207 ++++++++++++++ cpp/src/skyhook/protocol/ScanRequest.fbs | 29 ++ .../skyhook/protocol/ScanRequest_generated.h | 167 +++++++++++ cpp/src/skyhook/protocol/rados_protocol.cc | 99 +++++++ cpp/src/skyhook/protocol/rados_protocol.h | 103 +++++++ cpp/src/skyhook/protocol/skyhook_protocol.cc | 136 +++++++++ cpp/src/skyhook/protocol/skyhook_protocol.h | 116 ++++++++ .../skyhook/protocol/skyhook_protocol_test.cc | 71 +++++ cpp/src/skyhook/skyhook.pc.in | 26 ++ dev/tasks/tasks.yml | 9 + docker-compose.yml | 3 + 26 files changed, 1905 insertions(+) create mode 100644 ci/scripts/generate_dataset.py create mode 100755 ci/scripts/install_ceph.sh create mode 100755 ci/scripts/integration_skyhook.sh create mode 100644 cpp/cmake_modules/Findlibrados.cmake create mode 100644 cpp/src/skyhook/CMakeLists.txt create mode 100644 cpp/src/skyhook/client/CMakeLists.txt create mode 100644 cpp/src/skyhook/client/file_skyhook.cc create mode 100644 cpp/src/skyhook/client/file_skyhook.h create mode 100644 cpp/src/skyhook/cls/cls_skyhook.cc create mode 100644 cpp/src/skyhook/cls/cls_skyhook_test.cc create mode 100644 cpp/src/skyhook/protocol/ScanRequest.fbs create mode 100644 cpp/src/skyhook/protocol/ScanRequest_generated.h create mode 100644 cpp/src/skyhook/protocol/rados_protocol.cc create mode 100644 cpp/src/skyhook/protocol/rados_protocol.h create mode 100644 cpp/src/skyhook/protocol/skyhook_protocol.cc create mode 100644 cpp/src/skyhook/protocol/skyhook_protocol.h create mode 100644 cpp/src/skyhook/protocol/skyhook_protocol_test.cc create mode 100644 cpp/src/skyhook/skyhook.pc.in diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index de872da9a8f75..5a48c648e3bfe 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -78,6 +78,7 @@ RUN apt-get update -y -q && \ liblz4-dev \ libprotobuf-dev \ libprotoc-dev \ + libradospp-dev \ libre2-dev \ libsnappy-dev \ libssl-dev \ @@ -89,6 +90,8 @@ RUN apt-get update -y -q && \ pkg-config \ protobuf-compiler \ python3-pip \ + python3-rados \ + rados-objclass-dev \ rapidjson-dev \ tzdata \ wget && \ @@ -99,6 +102,8 @@ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ceph.sh # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index a11dd23b7f7fe..0ea9b1b89dc47 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -91,6 +91,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \ -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ diff --git a/ci/scripts/generate_dataset.py b/ci/scripts/generate_dataset.py new file mode 100644 index 0000000000000..42ee0763a1b25 --- /dev/null +++ b/ci/scripts/generate_dataset.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import os +import shutil +import random + +import pandas as pd + +if __name__ == "__main__": + # generate the test dataframe + data = { + "total_amount": list(), + "fare_amount": list() + } + for i in range(0, 500): + data['total_amount'].append(random.randint(1,11)*5) + data['fare_amount'].append(random.randint(1,11)*3) + df = pd.DataFrame(data) + + # dump the dataframe to a parquet file + df.to_parquet("skyhook_test_data.parquet") + + # create the dataset by copying the parquet files + shutil.rmtree("nyc", ignore_errors=True) + payment_type = ["1", "2", "3", "4"] + vendor_id = ["1", "2"] + for p in payment_type: + for v in vendor_id: + path = f"nyc/payment_type={p}/VendorID={v}" + os.makedirs(path, exist_ok=True) + shutil.copyfile("skyhook_test_data.parquet", os.path.join(path, f"{p}.{v}.parquet")) diff --git a/ci/scripts/install_ceph.sh b/ci/scripts/install_ceph.sh new file mode 100755 index 0000000000000..d9abef0619408 --- /dev/null +++ b/ci/scripts/install_ceph.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +ARCH=$(uname -m) +if [ "$ARCH" != "x86_64" ]; then + exit 0 +fi + +apt update +apt install -y attr ceph-common ceph-fuse ceph-mds ceph-mgr ceph-mon ceph-osd diff --git a/ci/scripts/integration_skyhook.sh b/ci/scripts/integration_skyhook.sh new file mode 100755 index 0000000000000..6c3011f9c63ed --- /dev/null +++ b/ci/scripts/integration_skyhook.sh @@ -0,0 +1,141 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script spawns a single-node Ceph cluster, creates a CephFS mount, +# generates a Parquet dataset, and runs the SkyhookDM integration tests. +# Taken from https://github.com/ceph/go-ceph/blob/master/micro-osd.sh + +set -e +set -x +set -u + +if [ "${ARROW_SKYHOOK:-OFF}" != "ON" ]; then + exit 0 +fi + +ARROW_BUILD_DIR=${1}/cpp +DIR=/tmp/integration_skyhook + +# set environment variables +pkill ceph || true +rm -rf ${DIR}/* +LOG_DIR=${DIR}/log +MON_DATA=${DIR}/mon +MDS_DATA=${DIR}/mds +MOUNTPT=${MDS_DATA}/mnt +OSD_DATA=${DIR}/osd +mkdir -p ${LOG_DIR} ${MON_DATA} ${OSD_DATA} ${MDS_DATA} ${MOUNTPT} +MDS_NAME="Z" +MON_NAME="a" +MGR_NAME="x" +MIRROR_ID="m" + +# cluster wide parameters +cat >> ${DIR}/ceph.conf < ${MDS_DATA}/keyring +ceph osd pool create cephfs_data 8 +ceph osd pool create cephfs_metadata 8 +ceph fs new cephfs cephfs_metadata cephfs_data +ceph fs ls +ceph-mds -i ${MDS_NAME} +ceph status +while [[ ! $(ceph mds stat | grep "up:active") ]]; do sleep 1; done + +# start a manager +ceph-mgr --id ${MGR_NAME} + +# test the setup +ceph --version +ceph status + +apt update +apt install -y python3-pip + +pushd ${ARROW_BUILD_DIR} + # create the rados-classes, if not there already + mkdir -p /usr/lib/x86_64-linux-gnu/rados-classes/ + cp debug/libcls_skyhook* /usr/lib/x86_64-linux-gnu/rados-classes/ + + # mount a ceph filesystem to /mnt/cephfs in the user-space using ceph-fuse + mkdir -p /mnt/cephfs + ceph-fuse /mnt/cephfs + sleep 5 + + # download an example dataset and copy into the mounted dir + pip3 install pyarrow pandas + python3 /arrow/ci/scripts/generate_dataset.py + cp -r nyc /mnt/cephfs/ + sleep 10 + + # run the tests + SKYHOOK_CLS_TEST=debug/skyhook-cls-test + if [ -f "$SKYHOOK_CLS_TEST" ]; then + debug/skyhook-cls-test + fi + + SKYHOOK_PROTOCOL_TEST=debug/skyhook-protocol-test + if [ -f "$SKYHOOK_PROTOCOL_TEST" ]; then + debug/skyhook-protocol-test + fi +popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c787794d39de6..3c05f235df3dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -351,6 +351,13 @@ if(ARROW_ENGINE) set(ARROW_COMPUTE ON) endif() +if(ARROW_SKYHOOK) + set(ARROW_DATASET ON) + set(ARROW_PARQUET ON) + set(ARROW_WITH_LZ4 ON) + set(ARROW_WITH_SNAPPY ON) +endif() + if(ARROW_DATASET) set(ARROW_COMPUTE ON) set(ARROW_FILESYSTEM ON) @@ -938,6 +945,10 @@ if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() +if(ARROW_SKYHOOK) + add_subdirectory(src/skyhook) +endif() + if(ARROW_BUILD_EXAMPLES) add_custom_target(runexample ctest -L example) add_subdirectory(examples/arrow) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 3568887fa261f..f81a1b1577901 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -266,6 +266,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_SKYHOOK "Build the Skyhook libraries" OFF) + define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) define_option(ARROW_TESTING "Build 
the Arrow testing libraries" OFF) diff --git a/cpp/cmake_modules/Findlibrados.cmake b/cpp/cmake_modules/Findlibrados.cmake new file mode 100644 index 0000000000000..695d73fae1cb8 --- /dev/null +++ b/cpp/cmake_modules/Findlibrados.cmake @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +find_path(LIBRADOS_INCLUDE_DIR rados/librados.hpp) + +find_library(LIBRADOS_LIBRARY NAMES rados) + +mark_as_advanced(LIBRADOS_LIBRARY LIBRADOS_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(librados DEFAULT_MSG LIBRADOS_LIBRARY + LIBRADOS_INCLUDE_DIR) + +if(librados_FOUND) + add_library(librados::rados UNKNOWN IMPORTED) + set_target_properties(librados::rados + PROPERTIES IMPORTED_LOCATION "${LIBRADOS_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES + "${LIBRADOS_INCLUDE_DIR}") +endif() diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 11210fdc27b81..a02954a23c83d 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -90,6 +90,9 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this { virtual ~Fragment() = default; + /// \brief Decide whether to apply filters and projections to this Fragment. + bool apply_compute = true; + protected: Fragment() = default; explicit Fragment(compute::Expression partition_expression, diff --git a/cpp/src/arrow/dataset/scanner_internal.h b/cpp/src/arrow/dataset/scanner_internal.h index 7a43feb61179c..2c78d1b277444 100644 --- a/cpp/src/arrow/dataset/scanner_internal.h +++ b/cpp/src/arrow/dataset/scanner_internal.h @@ -185,6 +185,11 @@ inline Result GetScanTaskIterator( auto fn = [options](std::shared_ptr fragment) -> Result { ARROW_ASSIGN_OR_RAISE(auto scan_task_it, fragment->Scan(options)); + // Skip applying compute on fragments if disabled. + if (!fragment->apply_compute) { + return std::move(scan_task_it); + } + auto partition = fragment->partition_expression(); // Apply the filter and/or projection to incoming RecordBatches by // wrapping the ScanTask with a FilterAndProjectScanTask diff --git a/cpp/src/skyhook/CMakeLists.txt b/cpp/src/skyhook/CMakeLists.txt new file mode 100644 index 0000000000000..22a414c5f26ac --- /dev/null +++ b/cpp/src/skyhook/CMakeLists.txt @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitationsn +# under the License. + +# add the client subdirectory +add_subdirectory(client) + +# define the targets to build +add_custom_target(arrow_skyhook_client) +add_custom_target(cls_skyhook) + +# define the dependencies +find_package(librados REQUIRED) +set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static librados::rados) +set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared librados::rados) + +# define the client and cls sources +set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc protocol/rados_protocol.cc + protocol/skyhook_protocol.cc) +set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc + protocol/skyhook_protocol.cc) + +# define the client library +add_arrow_lib(arrow_skyhook_client + PKG_CONFIG_NAME + skyhook + SOURCES + ${ARROW_SKYHOOK_CLIENT_SOURCES} + OUTPUTS + ARROW_SKYHOOK_CLIENT_LIBRARIES + SHARED_LINK_LIBS + ${ARROW_SKYHOOK_LINK_SHARED} + STATIC_LINK_LIBS + ${ARROW_SKYHOOK_LINK_STATIC}) + +# define the cls library +add_arrow_lib(cls_skyhook + SOURCES + ${ARROW_SKYHOOK_CLS_SOURCES} + OUTPUTS + ARROW_SKYHOOK_CLS_LIBRARIES + SHARED_LINK_LIBS + ${ARROW_SKYHOOK_LINK_SHARED} + STATIC_LINK_LIBS + ${ARROW_SKYHOOK_LINK_STATIC}) + +# finish building the project +add_dependencies(arrow_skyhook_client ${ARROW_SKYHOOK_CLIENT_LIBRARIES}) +add_dependencies(cls_skyhook ${ARROW_SKYHOOK_CLS_LIBRARIES}) + +# define the test builds +if(ARROW_TEST_LINKAGE STREQUAL "static") + set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_static ${ARROW_TEST_STATIC_LINK_LIBS}) +else() + set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_shared ${ARROW_TEST_SHARED_LINK_LIBS}) +endif() +list(APPEND ARROW_SKYHOOK_TEST_LINK_LIBS ${ARROW_SKYHOOK_CLIENT_LIBRARIES}) + +# build the cls and protocol tests +add_arrow_test(cls_test + SOURCES + cls/cls_skyhook_test.cc + EXTRA_LINK_LIBS + ${ARROW_SKYHOOK_TEST_LINK_LIBS} + PREFIX + "skyhook") + +add_arrow_test(protocol_test + SOURCES + protocol/skyhook_protocol_test.cc + EXTRA_LINK_LIBS + ${ARROW_SKYHOOK_TEST_LINK_LIBS} + PREFIX + "skyhook") diff --git a/cpp/src/skyhook/client/CMakeLists.txt b/cpp/src/skyhook/client/CMakeLists.txt new file mode 100644 index 0000000000000..6255d9ad39c5c --- /dev/null +++ b/cpp/src/skyhook/client/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +arrow_install_all_headers("skyhook/client") diff --git a/cpp/src/skyhook/client/file_skyhook.cc b/cpp/src/skyhook/client/file_skyhook.cc new file mode 100644 index 0000000000000..f8b57f441d200 --- /dev/null +++ b/cpp/src/skyhook/client/file_skyhook.cc @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "skyhook/client/file_skyhook.h" +#include "skyhook/protocol/rados_protocol.h" +#include "skyhook/protocol/skyhook_protocol.h" + +#include "arrow/compute/exec/expression.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/file_ipc.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/util/compression.h" + +namespace skyhook { + +/// A ScanTask to scan a file fragment in Skyhook format. +class SkyhookScanTask : public arrow::dataset::ScanTask { + public: + SkyhookScanTask(std::shared_ptr options, + std::shared_ptr fragment, + arrow::dataset::FileSource source, + std::shared_ptr doa, + skyhook::SkyhookFileType::type file_format, + arrow::compute::Expression partition_expression) + : ScanTask(std::move(options), std::move(fragment)), + source_(std::move(source)), + doa_(std::move(doa)), + file_format_(file_format), + partition_expression_(std::move(partition_expression)) {} + + arrow::Result Execute() override { + /// Retrieve the size of the file using POSIX `stat`. + struct stat st {}; + RETURN_NOT_OK(doa_->Stat(source_.path(), st)); + + /// Create a ScanRequest instance. + skyhook::ScanRequest req; + req.filter_expression = options_->filter; + req.partition_expression = partition_expression_; + req.projection_schema = options_->projected_schema; + req.dataset_schema = options_->dataset_schema; + req.file_size = st.st_size; + req.file_format = file_format_; + + /// Serialize the ScanRequest into a ceph bufferlist. + ceph::bufferlist request; + RETURN_NOT_OK(skyhook::SerializeScanRequest(req, &request)); + + /// Execute the Ceph object class method `scan_op`. + ceph::bufferlist result; + RETURN_NOT_OK(doa_->Exec(st.st_ino, "scan_op", request, result)); + + /// Read RecordBatches from the result bufferlist. Since, this step might use + /// threads for decompressing compressed batches, to avoid running into + /// [ARROW-12597], we switch off threaded decompression to avoid nested threading + /// scenarios when scan tasks are executed in parallel by the CpuThreadPool. 
+ arrow::RecordBatchVector batches; + RETURN_NOT_OK(skyhook::DeserializeTable(result, !options_->use_threads, &batches)); + return arrow::MakeVectorIterator(std::move(batches)); + } + + protected: + arrow::dataset::FileSource source_; + std::shared_ptr doa_; + skyhook::SkyhookFileType::type file_format_; + arrow::compute::Expression partition_expression_; +}; + +class SkyhookFileFormat::Impl { + public: + Impl(std::shared_ptr ctx, std::string file_format) + : ctx_(std::move(ctx)), file_format_(std::move(file_format)) {} + + ~Impl() = default; + + arrow::Status Init() { + /// Connect to the RADOS cluster and instantiate a `SkyhookDirectObjectAccess` + /// instance. + auto connection = std::make_shared(ctx_); + RETURN_NOT_OK(connection->Connect()); + doa_ = std::make_shared(connection); + return arrow::Status::OK(); + } + + arrow::Result ScanFile( + const std::shared_ptr& options, + const std::shared_ptr& file) const { + /// Make sure client-side filtering and projection is turned off. + file->apply_compute = false; + + /// Convert string file format name to Enum. + skyhook::SkyhookFileType::type file_format; + if (file_format_ == "parquet") { + file_format = skyhook::SkyhookFileType::type::PARQUET; + } else if (file_format_ == "ipc") { + file_format = skyhook::SkyhookFileType::type::IPC; + } else { + return arrow::Status::Invalid("Unsupported file format ", file_format_); + } + + arrow::dataset::ScanTaskVector v{std::make_shared( + options, file, file->source(), doa_, file_format, file->partition_expression())}; + return arrow::MakeVectorIterator(v); + } + + arrow::Result> Inspect( + const arrow::dataset::FileSource& source) const { + std::shared_ptr file_format; + /// Convert string file format name to Arrow FileFormat. + if (file_format_ == "parquet") { + file_format = std::make_shared(); + } else if (file_format_ == "ipc") { + file_format = std::make_shared(); + } else { + return arrow::Status::Invalid("Unsupported file format ", file_format_); + } + std::shared_ptr schema; + ARROW_ASSIGN_OR_RAISE(schema, file_format->Inspect(source)); + return schema; + } + + private: + std::shared_ptr doa_; + std::shared_ptr ctx_; + std::string file_format_; +}; + +arrow::Result> SkyhookFileFormat::Make( + std::shared_ptr ctx, std::string file_format) { + auto format = + std::make_shared(std::move(ctx), std::move(file_format)); + /// Establish connection to the Ceph cluster. 
+ RETURN_NOT_OK(format->Init()); + return format; +} + +SkyhookFileFormat::SkyhookFileFormat(std::shared_ptr ctx, + std::string file_format) + : impl_(new Impl(std::move(ctx), std::move(file_format))) {} + +SkyhookFileFormat::~SkyhookFileFormat() = default; + +arrow::Status SkyhookFileFormat::Init() { return impl_->Init(); } + +arrow::Result> SkyhookFileFormat::Inspect( + const arrow::dataset::FileSource& source) const { + return impl_->Inspect(source); +} + +arrow::Result SkyhookFileFormat::ScanFile( + const std::shared_ptr& options, + const std::shared_ptr& file) const { + return impl_->ScanFile(options, file); +} + +std::shared_ptr +SkyhookFileFormat::DefaultWriteOptions() { + return nullptr; +} + +arrow::Result> SkyhookFileFormat::MakeWriter( + std::shared_ptr destination, + std::shared_ptr schema, + std::shared_ptr options, + arrow::fs::FileLocator destination_locator) const { + return arrow::Status::NotImplemented("Skyhook writer not yet implemented."); +} + +} // namespace skyhook diff --git a/cpp/src/skyhook/client/file_skyhook.h b/cpp/src/skyhook/client/file_skyhook.h new file mode 100644 index 0000000000000..52a19f5bf3b92 --- /dev/null +++ b/cpp/src/skyhook/client/file_skyhook.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "arrow/api.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/dataset/scanner.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" + +namespace skyhook { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +/// \struct RadosConnCtx +/// \brief A struct to hold the parameters required +/// for connecting to a RADOS cluster. +struct RadosConnCtx { + std::string ceph_config_path; + std::string ceph_data_pool; + std::string ceph_user_name; + std::string ceph_cluster_name; + std::string ceph_cls_name; + + RadosConnCtx(std::string ceph_config_path, std::string ceph_data_pool, + std::string ceph_user_name, std::string ceph_cluster_name, + std::string ceph_cls_name) + : ceph_config_path(std::move(ceph_config_path)), + ceph_data_pool(std::move(ceph_data_pool)), + ceph_user_name(std::move(ceph_user_name)), + ceph_cluster_name(std::move(ceph_cluster_name)), + ceph_cls_name(std::move(ceph_cls_name)) {} +}; + +/// \class SkyhookFileFormat +/// \brief A FileFormat implementation that offloads fragment +/// scan operations to the Ceph OSDs. For more details, see the +/// Skyhook paper, https://arxiv.org/pdf/2105.09894.pdf. 
+class SkyhookFileFormat : public arrow::dataset::FileFormat { + public: + static arrow::Result> Make( + std::shared_ptr ctx, std::string file_format); + SkyhookFileFormat(std::shared_ptr ctx, std::string file_format); + + ~SkyhookFileFormat() override; + + std::string type_name() const override { return "skyhook"; } + + bool Equals(const arrow::dataset::FileFormat& other) const override { + return type_name() == other.type_name(); + } + + arrow::Result IsSupported( + const arrow::dataset::FileSource& source) const override { + return true; + } + + /// \brief Return the schema of the file fragment. + /// \param[in] source The source of the file fragment. + /// \return The schema of the file fragment. + arrow::Result> Inspect( + const arrow::dataset::FileSource& source) const override; + + /// \brief Scan a file fragment. + /// \param[in] options The ScanOptions to use. + /// \param[in] file The file fragment to scan. + /// \return An iterator of ScanTasks. + arrow::Result ScanFile( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + /// \brief Create a writer for this format. + arrow::Result> MakeWriter( + std::shared_ptr destination, + std::shared_ptr schema, + std::shared_ptr options, + arrow::fs::FileLocator destination_locator) const override; + + /// \brief Get default write options for this format. + std::shared_ptr DefaultWriteOptions() override; + + private: + class Impl; + std::unique_ptr impl_; + + /// \brief Initialize the SkyhookFileFormat by connecting to RADOS. + arrow::Status Init(); +}; + +/// @} + +} // namespace skyhook diff --git a/cpp/src/skyhook/cls/cls_skyhook.cc b/cpp/src/skyhook/cls/cls_skyhook.cc new file mode 100644 index 0000000000000..5f50dd04607dd --- /dev/null +++ b/cpp/src/skyhook/cls/cls_skyhook.cc @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include + +#include + +#include "arrow/compute/exec/expression.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_ipc.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/io/interfaces.h" +#include "arrow/result.h" +#include "arrow/util/logging.h" + +#include "skyhook/protocol/skyhook_protocol.h" + +CLS_VER(1, 0) +CLS_NAME(skyhook) + +cls_handle_t h_class; +cls_method_handle_t h_scan_op; + +/// \brief Log skyhook errors using RADOS object class SDK's logger. +void LogSkyhookError(const std::string& msg) { CLS_LOG(0, "error: %s", msg.c_str()); } + +/// \class RandomAccessObject +/// \brief An interface to provide a file-like view over RADOS objects. 
+class RandomAccessObject : public arrow::io::RandomAccessFile { + public: + explicit RandomAccessObject(cls_method_context_t hctx, int64_t file_size) { + hctx_ = hctx; + content_length_ = file_size; + chunks_ = std::vector>(); + } + + ~RandomAccessObject() override { DCHECK_OK(Close()); } + + /// Check if the file stream is closed. + arrow::Status CheckClosed() const { + if (closed_) { + return arrow::Status::Invalid("Operation on closed stream"); + } + return arrow::Status::OK(); + } + + /// Check if the position of the object is valid. + arrow::Status CheckPosition(int64_t position, const char* action) const { + if (position < 0) { + return arrow::Status::Invalid("Cannot ", action, " from negative position"); + } + if (position > content_length_) { + return arrow::Status::IOError("Cannot ", action, " past end of file"); + } + return arrow::Status::OK(); + } + + arrow::Result ReadAt(int64_t position, int64_t nbytes, void* out) override { + return arrow::Status::NotImplemented( + "ReadAt has not been implemented in RandomAccessObject"); + } + + /// Read a specified number of bytes from a specified position. + arrow::Result> ReadAt(int64_t position, + int64_t nbytes) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPosition(position, "read")); + + // No need to allocate more than the remaining number of bytes + nbytes = std::min(nbytes, content_length_ - position); + + if (nbytes > 0) { + std::shared_ptr bl = std::make_shared(); + cls_cxx_read(hctx_, position, nbytes, bl.get()); + chunks_.push_back(bl); + return std::make_shared((uint8_t*)bl->c_str(), bl->length()); + } + return std::make_shared(""); + } + + /// Read a specified number of bytes from the current position. + arrow::Result> Read(int64_t nbytes) override { + ARROW_ASSIGN_OR_RAISE(auto buffer, ReadAt(pos_, nbytes)); + pos_ += buffer->size(); + return std::move(buffer); + } + + /// Read a specified number of bytes from the current position into an output stream. + arrow::Result Read(int64_t nbytes, void* out) override { + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(pos_, nbytes, out)); + pos_ += bytes_read; + return bytes_read; + } + + /// Return the size of the file. + arrow::Result GetSize() override { + RETURN_NOT_OK(CheckClosed()); + return content_length_; + } + + /// Sets the file-pointer offset, measured from the beginning of the + /// file, at which the next read or write occurs. + arrow::Status Seek(int64_t position) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPosition(position, "seek")); + + pos_ = position; + return arrow::Status::OK(); + } + + /// Returns the file-pointer offset. + arrow::Result Tell() const override { + RETURN_NOT_OK(CheckClosed()); + return pos_; + } + + /// Mark the file as closed. + arrow::Status Close() override { + closed_ = true; + return arrow::Status::OK(); + } + + bool closed() const override { return closed_; } + + private: + cls_method_context_t hctx_; + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = -1; + std::vector> chunks_; +}; + +/// \brief Driver function to execute the Scan operations. +/// \param[in] hctx RADOS object context. +/// \param[in] req The scan request received from the client. +/// \param[in] format The file format instance to use in the scan. +/// \param[in] fragment_scan_options The fragment scan options to use to customize the +/// scan. +/// \return Table. 
+arrow::Result> DoScan( + cls_method_context_t hctx, const skyhook::ScanRequest& req, + const std::shared_ptr& format, + const std::shared_ptr& fragment_scan_options) { + auto file = std::make_shared(hctx, req.file_size); + arrow::dataset::FileSource source(file); + ARROW_ASSIGN_OR_RAISE( + auto fragment, format->MakeFragment(std::move(source), req.partition_expression)); + auto options = std::make_shared(); + auto builder = std::make_shared( + req.dataset_schema, std::move(fragment), std::move(options)); + + ARROW_RETURN_NOT_OK(builder->Filter(req.filter_expression)); + ARROW_RETURN_NOT_OK(builder->Project(req.projection_schema->field_names())); + ARROW_RETURN_NOT_OK(builder->UseThreads(true)); + ARROW_RETURN_NOT_OK(builder->FragmentScanOptions(fragment_scan_options)); + + ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish()); + ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable()); + return table; +} + +/// \brief Scan RADOS objects containing Arrow IPC data. +/// \param[in] hctx The RADOS object context. +/// \param[in] req The scan request received from the client. +/// \return Table. +static arrow::Result> ScanIpcObject( + cls_method_context_t hctx, skyhook::ScanRequest req) { + auto format = std::make_shared(); + auto fragment_scan_options = std::make_shared(); + + ARROW_ASSIGN_OR_RAISE(auto result_table, DoScan(hctx, req, std::move(format), + std::move(fragment_scan_options))); + return result_table; +} + +/// \brief Scan RADOS objects containing Parquet binary data. +/// \param[in] hctx The RADOS object context. +/// \param[in] req The scan request received from the client. +/// \return Table. +static arrow::Result> ScanParquetObject( + cls_method_context_t hctx, skyhook::ScanRequest req) { + auto format = std::make_shared(); + auto fragment_scan_options = + std::make_shared(); + + ARROW_ASSIGN_OR_RAISE(auto result_table, DoScan(hctx, req, std::move(format), + std::move(fragment_scan_options))); + return result_table; +} + +/// \brief The scan operation to execute on the Ceph OSD nodes. The scan request is +/// deserialized, the object is scanned, and the resulting table is serialized +/// and sent back to the client. +/// \param[in] hctx The RADOS object context. +/// \param[in] in A bufferlist containing serialized Scan request. +/// \param[out] out A bufferlist to store the serialized resultant table. +/// \return Exit code. +static int scan_op(cls_method_context_t hctx, ceph::bufferlist* in, + ceph::bufferlist* out) { + // Components required to construct a File fragment. + arrow::Status s; + skyhook::ScanRequest req; + + // Deserialize the scan request. + if (!(s = skyhook::DeserializeScanRequest(*in, &req)).ok()) { + LogSkyhookError(s.message()); + return SCAN_REQ_DESER_ERR_CODE; + } + + // Scan the object. 
+ std::shared_ptr table; + arrow::Result> maybe_table; + switch (req.file_format) { + case skyhook::SkyhookFileType::type::PARQUET: + maybe_table = ScanParquetObject(hctx, std::move(req)); + if (!maybe_table.ok()) { + LogSkyhookError("Could not scan parquet object: " + + maybe_table.status().ToString()); + return SCAN_ERR_CODE; + } + table = *maybe_table; + break; + case skyhook::SkyhookFileType::type::IPC: + maybe_table = ScanIpcObject(hctx, std::move(req)); + if (!maybe_table.ok()) { + LogSkyhookError("Could not scan IPC object: " + maybe_table.status().ToString()); + return SCAN_ERR_CODE; + } + table = *maybe_table; + break; + default: + table = nullptr; + } + if (!table) { + LogSkyhookError("Unsupported file format"); + return SCAN_ERR_CODE; + } + + // Serialize the resultant table to send back to the client. + ceph::bufferlist bl; + if (!(s = skyhook::SerializeTable(table, &bl)).ok()) { + LogSkyhookError(s.message()); + return SCAN_RES_SER_ERR_CODE; + } + + *out = std::move(bl); + return 0; +} + +void __cls_init() { + /// Register the skyhook object classes with the OSD. + cls_register("skyhook", &h_class); + cls_register_cxx_method(h_class, "scan_op", CLS_METHOD_RD, scan_op, &h_scan_op); +} diff --git a/cpp/src/skyhook/cls/cls_skyhook_test.cc b/cpp/src/skyhook/cls/cls_skyhook_test.cc new file mode 100644 index 0000000000000..461cdd6bc79de --- /dev/null +++ b/cpp/src/skyhook/cls/cls_skyhook_test.cc @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "skyhook/client/file_skyhook.h" + +#include "arrow/compute/exec/expression.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/filesystem/api.h" +#include "arrow/io/api.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/iterator.h" +#include "gtest/gtest.h" + +#include "parquet/arrow/reader.h" +#include "parquet/arrow/writer.h" + +std::shared_ptr GetSkyhookFormat() { + // The constants below should match the parameters with + // which the Ceph cluster is configured in integration_skyhook.sh. + // Currently, all the default values have been used. 
+ std::string ceph_config_path = "/etc/ceph/ceph.conf"; + std::string ceph_data_pool = "cephfs_data"; + std::string ceph_user_name = "client.admin"; + std::string ceph_cluster_name = "ceph"; + std::string ceph_cls_name = "skyhook"; + std::shared_ptr rados_ctx = + std::make_shared(ceph_config_path, ceph_data_pool, + ceph_user_name, ceph_cluster_name, + ceph_cls_name); + EXPECT_OK_AND_ASSIGN(auto format, + skyhook::SkyhookFileFormat::Make(rados_ctx, "parquet")); + return format; +} + +std::shared_ptr GetParquetFormat() { + return std::make_shared(); +} + +std::shared_ptr GetDatasetFromDirectory( + std::shared_ptr fs, + std::shared_ptr format, std::string dir) { + arrow::fs::FileSelector s; + s.base_dir = std::move(dir); + s.recursive = true; + + arrow::dataset::FileSystemFactoryOptions options; + options.partitioning = std::make_shared( + arrow::schema({arrow::field("payment_type", arrow::int32()), + arrow::field("VendorID", arrow::int32())})); + EXPECT_OK_AND_ASSIGN(auto factory, arrow::dataset::FileSystemDatasetFactory::Make( + std::move(fs), s, std::move(format), options)); + + arrow::dataset::InspectOptions inspect_options; + arrow::dataset::FinishOptions finish_options; + EXPECT_OK_AND_ASSIGN(auto schema, factory->Inspect(inspect_options)); + EXPECT_OK_AND_ASSIGN(auto dataset, factory->Finish(finish_options)); + return dataset; +} + +std::shared_ptr GetFileSystemFromUri(const std::string& uri, + std::string* path) { + EXPECT_OK_AND_ASSIGN(auto fs, arrow::fs::FileSystemFromUri(uri, path)); + return fs; +} + +std::shared_ptr GetDatasetFromPath( + std::shared_ptr fs, + std::shared_ptr format, std::string path) { + EXPECT_OK_AND_ASSIGN(auto info, fs->GetFileInfo(path)); + return GetDatasetFromDirectory(std::move(fs), std::move(format), std::move(path)); +} + +std::shared_ptr GetScannerFromDataset( + const std::shared_ptr& dataset, + std::vector columns, arrow::compute::Expression filter, + bool use_threads) { + EXPECT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); + + if (!columns.empty()) { + ARROW_EXPECT_OK(scanner_builder->Project(std::move(columns))); + } + + ARROW_EXPECT_OK(scanner_builder->Filter(std::move(filter))); + ARROW_EXPECT_OK(scanner_builder->UseThreads(use_threads)); + EXPECT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); + return scanner; +} + +TEST(TestSkyhookCLS, SelectEntireDataset) { + std::string path; + auto fs = GetFileSystemFromUri("file:///mnt/cephfs/nyc", &path); + std::vector columns; + + auto parquet_format = GetParquetFormat(); + auto dataset = GetDatasetFromPath(fs, parquet_format, path); + auto scanner = + GetScannerFromDataset(dataset, columns, arrow::compute::literal(true), true); + EXPECT_OK_AND_ASSIGN(auto table_parquet, scanner->ToTable()); + + auto skyhook_format = GetSkyhookFormat(); + dataset = GetDatasetFromPath(fs, skyhook_format, path); + scanner = GetScannerFromDataset(dataset, columns, arrow::compute::literal(true), true); + EXPECT_OK_AND_ASSIGN(auto table_skyhook_parquet, scanner->ToTable()); + + ASSERT_EQ(table_parquet->Equals(*table_skyhook_parquet), 1); + ASSERT_EQ(table_parquet->num_rows(), table_skyhook_parquet->num_rows()); +} + +TEST(TestSkyhookCLS, SelectFewRows) { + std::string path; + auto fs = GetFileSystemFromUri("file:///mnt/cephfs/nyc", &path); + std::vector columns; + auto filter = arrow::compute::greater(arrow::compute::field_ref("payment_type"), + arrow::compute::literal(2)); + auto parquet_format = GetParquetFormat(); + auto dataset = GetDatasetFromPath(fs, parquet_format, path); + auto scanner = 
GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_parquet, scanner->ToTable()); + + auto skyhook_format = GetSkyhookFormat(); + dataset = GetDatasetFromPath(fs, skyhook_format, path); + scanner = GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_skyhook_parquet, scanner->ToTable()); + + ASSERT_EQ(table_parquet->Equals(*table_skyhook_parquet), 1); + ASSERT_EQ(table_parquet->num_rows(), table_skyhook_parquet->num_rows()); +} + +TEST(TestSkyhookCLS, SelectFewColumns) { + std::string path; + auto fs = GetFileSystemFromUri("file:///mnt/cephfs/nyc", &path); + std::vector columns = {"fare_amount", "total_amount"}; + + auto parquet_format = GetParquetFormat(); + auto dataset = GetDatasetFromPath(fs, parquet_format, path); + auto scanner = + GetScannerFromDataset(dataset, columns, arrow::compute::literal(true), true); + EXPECT_OK_AND_ASSIGN(auto table_parquet, scanner->ToTable()); + + auto skyhook_format = GetSkyhookFormat(); + dataset = GetDatasetFromPath(fs, skyhook_format, path); + scanner = GetScannerFromDataset(dataset, columns, arrow::compute::literal(true), true); + EXPECT_OK_AND_ASSIGN(auto table_skyhook_parquet, scanner->ToTable()); + + ASSERT_EQ(table_parquet->Equals(*table_skyhook_parquet), 1); + ASSERT_EQ(table_parquet->num_rows(), table_skyhook_parquet->num_rows()); +} + +TEST(TestSkyhookCLS, SelectRowsAndColumnsOnPartitionKey) { + std::string path; + auto fs = GetFileSystemFromUri("file:///mnt/cephfs/nyc", &path); + std::vector columns = {"fare_amount", "VendorID", "payment_type"}; + auto filter = arrow::compute::greater(arrow::compute::field_ref("payment_type"), + arrow::compute::literal(2)); + + auto parquet_format = GetParquetFormat(); + auto dataset = GetDatasetFromPath(fs, parquet_format, path); + auto scanner = GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_parquet, scanner->ToTable()); + + auto skyhook_format = GetSkyhookFormat(); + dataset = GetDatasetFromPath(fs, skyhook_format, path); + scanner = GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_skyhook_parquet, scanner->ToTable()); + + ASSERT_EQ(table_parquet->Equals(*table_skyhook_parquet), 1); + ASSERT_EQ(table_parquet->num_rows(), table_skyhook_parquet->num_rows()); +} + +TEST(TestSkyhookCLS, SelectRowsAndColumnsOnlyOnPartitionKey) { + std::string path; + auto fs = GetFileSystemFromUri("file:///mnt/cephfs/nyc", &path); + std::vector columns = {"total_amount", "VendorID", "payment_type"}; + auto filter = arrow::compute::and_( + arrow::compute::greater(arrow::compute::field_ref("payment_type"), + arrow::compute::literal(2)), + arrow::compute::greater(arrow::compute::field_ref("VendorID"), + arrow::compute::literal(1))); + + auto parquet_format = GetParquetFormat(); + auto dataset = GetDatasetFromPath(fs, parquet_format, path); + auto scanner = GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_parquet, scanner->ToTable()); + + auto skyhook_format = GetSkyhookFormat(); + dataset = GetDatasetFromPath(fs, skyhook_format, path); + scanner = GetScannerFromDataset(dataset, columns, filter, true); + EXPECT_OK_AND_ASSIGN(auto table_skyhook_parquet, scanner->ToTable()); + + ASSERT_EQ(table_parquet->Equals(*table_skyhook_parquet), 1); + ASSERT_EQ(table_parquet->num_rows(), table_skyhook_parquet->num_rows()); +} diff --git a/cpp/src/skyhook/protocol/ScanRequest.fbs b/cpp/src/skyhook/protocol/ScanRequest.fbs new file mode 100644 index 
0000000000000..870d603a5ea28 --- /dev/null +++ b/cpp/src/skyhook/protocol/ScanRequest.fbs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +namespace org.apache.arrow.flatbuf; + +table ScanRequest { + file_size: long; + file_format: short; + filter: [ubyte]; + partition: [ubyte]; + dataset_schema: [ubyte]; + projection_schema: [ubyte]; +} + +root_type ScanRequest; diff --git a/cpp/src/skyhook/protocol/ScanRequest_generated.h b/cpp/src/skyhook/protocol/ScanRequest_generated.h new file mode 100644 index 0000000000000..884857a1b4df7 --- /dev/null +++ b/cpp/src/skyhook/protocol/ScanRequest_generated.h @@ -0,0 +1,167 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_SCANREQUEST_ORG_APACHE_ARROW_FLATBUF_H_ +#define FLATBUFFERS_GENERATED_SCANREQUEST_ORG_APACHE_ARROW_FLATBUF_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace org { +namespace apache { +namespace arrow { +namespace flatbuf { + +struct ScanRequest; + +struct ScanRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FILE_SIZE = 4, + VT_FILE_FORMAT = 6, + VT_FILTER = 8, + VT_PARTITION = 10, + VT_DATASET_SCHEMA = 12, + VT_PROJECTION_SCHEMA = 14 + }; + int64_t file_size() const { + return GetField(VT_FILE_SIZE, 0); + } + int16_t file_format() const { + return GetField(VT_FILE_FORMAT, 0); + } + const flatbuffers::Vector *filter() const { + return GetPointer *>(VT_FILTER); + } + const flatbuffers::Vector *partition() const { + return GetPointer *>(VT_PARTITION); + } + const flatbuffers::Vector *dataset_schema() const { + return GetPointer *>(VT_DATASET_SCHEMA); + } + const flatbuffers::Vector *projection_schema() const { + return GetPointer *>(VT_PROJECTION_SCHEMA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FILE_SIZE) && + VerifyField(verifier, VT_FILE_FORMAT) && + VerifyOffset(verifier, VT_FILTER) && + verifier.VerifyVector(filter()) && + VerifyOffset(verifier, VT_PARTITION) && + verifier.VerifyVector(partition()) && + VerifyOffset(verifier, VT_DATASET_SCHEMA) && + verifier.VerifyVector(dataset_schema()) && + VerifyOffset(verifier, VT_PROJECTION_SCHEMA) && + verifier.VerifyVector(projection_schema()) && + verifier.EndTable(); + } +}; + +struct ScanRequestBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_file_size(int64_t file_size) { + fbb_.AddElement(ScanRequest::VT_FILE_SIZE, file_size, 0); + } + void add_file_format(int16_t file_format) { + fbb_.AddElement(ScanRequest::VT_FILE_FORMAT, file_format, 0); + } + void add_filter(flatbuffers::Offset> filter) { + fbb_.AddOffset(ScanRequest::VT_FILTER, filter); 
+ } + void add_partition(flatbuffers::Offset> partition) { + fbb_.AddOffset(ScanRequest::VT_PARTITION, partition); + } + void add_dataset_schema(flatbuffers::Offset> dataset_schema) { + fbb_.AddOffset(ScanRequest::VT_DATASET_SCHEMA, dataset_schema); + } + void add_projection_schema(flatbuffers::Offset> projection_schema) { + fbb_.AddOffset(ScanRequest::VT_PROJECTION_SCHEMA, projection_schema); + } + explicit ScanRequestBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ScanRequestBuilder &operator=(const ScanRequestBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateScanRequest( + flatbuffers::FlatBufferBuilder &_fbb, + int64_t file_size = 0, + int16_t file_format = 0, + flatbuffers::Offset> filter = 0, + flatbuffers::Offset> partition = 0, + flatbuffers::Offset> dataset_schema = 0, + flatbuffers::Offset> projection_schema = 0) { + ScanRequestBuilder builder_(_fbb); + builder_.add_file_size(file_size); + builder_.add_projection_schema(projection_schema); + builder_.add_dataset_schema(dataset_schema); + builder_.add_partition(partition); + builder_.add_filter(filter); + builder_.add_file_format(file_format); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateScanRequestDirect( + flatbuffers::FlatBufferBuilder &_fbb, + int64_t file_size = 0, + int16_t file_format = 0, + const std::vector *filter = nullptr, + const std::vector *partition = nullptr, + const std::vector *dataset_schema = nullptr, + const std::vector *projection_schema = nullptr) { + auto filter__ = filter ? _fbb.CreateVector(*filter) : 0; + auto partition__ = partition ? _fbb.CreateVector(*partition) : 0; + auto dataset_schema__ = dataset_schema ? _fbb.CreateVector(*dataset_schema) : 0; + auto projection_schema__ = projection_schema ? _fbb.CreateVector(*projection_schema) : 0; + return org::apache::arrow::flatbuf::CreateScanRequest( + _fbb, + file_size, + file_format, + filter__, + partition__, + dataset_schema__, + projection_schema__); +} + +inline const org::apache::arrow::flatbuf::ScanRequest *GetScanRequest(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const org::apache::arrow::flatbuf::ScanRequest *GetSizePrefixedScanRequest(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyScanRequestBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedScanRequestBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishScanRequestBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedScanRequestBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace flatbuf +} // namespace arrow +} // namespace apache +} // namespace org + +#endif // FLATBUFFERS_GENERATED_SCANREQUEST_ORG_APACHE_ARROW_FLATBUF_H_ diff --git a/cpp/src/skyhook/protocol/rados_protocol.cc b/cpp/src/skyhook/protocol/rados_protocol.cc new file mode 100644 index 0000000000000..cb1acec1faa91 --- /dev/null +++ b/cpp/src/skyhook/protocol/rados_protocol.cc @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "skyhook/protocol/rados_protocol.h" + +#include "arrow/util/io_util.h" + +#include +#include + +namespace skyhook { +namespace rados { + +template +arrow::Status GetStatusFromReturnCode(int code, Args&&... args) { + if (code) + return arrow::internal::StatusFromErrno(code, arrow::StatusCode::Invalid, + std::forward(args)...); + return arrow::Status::OK(); +} + +arrow::Status IoCtxInterface::read(const std::string& oid, ceph::bufferlist& bl, + size_t len, uint64_t offset) { + return GetStatusFromReturnCode(ioCtx->read(oid, bl, len, offset), + "ioctx->read failed."); +} + +arrow::Status IoCtxInterface::exec(const std::string& oid, const char* cls, + const char* method, ceph::bufferlist& in, + ceph::bufferlist& out) { + return GetStatusFromReturnCode(ioCtx->exec(oid, cls, method, in, out), + "ioctx->exec failed."); +} + +arrow::Status IoCtxInterface::stat(const std::string& oid, uint64_t* psize) { + return GetStatusFromReturnCode(ioCtx->stat(oid, psize, NULL), "ioctx->stat failed."); +} + +arrow::Status RadosInterface::init2(const char* const name, const char* const clustername, + uint64_t flags) { + return GetStatusFromReturnCode(cluster->init2(name, clustername, flags), + "rados->init failed."); +} + +arrow::Status RadosInterface::ioctx_create(const char* name, IoCtxInterface* pioctx) { + librados::IoCtx ioCtx; + int ret = cluster->ioctx_create(name, ioCtx); + pioctx->setIoCtx(&ioCtx); + return GetStatusFromReturnCode(ret, "rados->ioctx_create failed."); +} + +arrow::Status RadosInterface::conf_read_file(const char* const path) { + return GetStatusFromReturnCode(cluster->conf_read_file(path), + "rados->conf_read_file failed."); +} + +arrow::Status RadosInterface::connect() { + return GetStatusFromReturnCode(cluster->connect(), "rados->connect failed."); +} + +void RadosInterface::shutdown() { cluster->shutdown(); } + +RadosConn::~RadosConn() { Shutdown(); } + +arrow::Status RadosConn::Connect() { + if (connected) { + return arrow::Status::OK(); + } + + ARROW_RETURN_NOT_OK( + rados->init2(ctx->ceph_user_name.c_str(), ctx->ceph_cluster_name.c_str(), 0)); + ARROW_RETURN_NOT_OK(rados->conf_read_file(ctx->ceph_config_path.c_str())); + ARROW_RETURN_NOT_OK(rados->connect()); + ARROW_RETURN_NOT_OK(rados->ioctx_create(ctx->ceph_data_pool.c_str(), io_ctx.get())); + return arrow::Status::OK(); +} + +void RadosConn::Shutdown() { + if (connected) { + rados->shutdown(); + connected = false; + } +} + +} // namespace rados +} // namespace skyhook diff --git a/cpp/src/skyhook/protocol/rados_protocol.h b/cpp/src/skyhook/protocol/rados_protocol.h new file mode 100644 index 0000000000000..3e5fac7640b65 --- /dev/null +++ b/cpp/src/skyhook/protocol/rados_protocol.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include
+
+#include "arrow/status.h"
+#include "arrow/util/make_unique.h"
+
+#include "skyhook/client/file_skyhook.h"
+
+namespace skyhook {
+namespace rados {
+
+class IoCtxInterface {
+ public:
+  IoCtxInterface() { ioCtx = arrow::internal::make_unique(); }
+  /// \brief Read from a RADOS object.
+  ///
+  /// \param[in] oid the ID of the object to read.
+  /// \param[in] bl a bufferlist to hold the contents of the read object.
+  /// \param[in] len the length of data to read from the object.
+  /// \param[in] offset the offset to read from in the object.
+  arrow::Status read(const std::string& oid, ceph::bufferlist& bl, size_t len,
+                     uint64_t offset);
+  /// \brief Executes a Ceph Object Class method.
+  ///
+  /// \param[in] oid the object ID on which to invoke the CLS function.
+  /// \param[in] cls the name of the object class.
+  /// \param[in] method the name of the object class method.
+  /// \param[in] in a bufferlist to send data to the object class method.
+  /// \param[in] out a bufferlist to receive data from the object class method.
+  arrow::Status exec(const std::string& oid, const char* cls, const char* method,
+                     ceph::bufferlist& in, ceph::bufferlist& out);
+  /// \brief Execute POSIX stat on a RADOS object.
+  ///
+  /// \param[in] oid the object ID on which to call stat.
+  /// \param[out] psize hold the size of the object.
+  arrow::Status stat(const std::string& oid, uint64_t* psize);
+  /// \brief Set the `librados::IoCtx` instance inside a IoCtxInterface instance.
+  void setIoCtx(librados::IoCtx* ioCtx_) { *ioCtx = *ioCtx_; }
+
+ private:
+  std::unique_ptr ioCtx;
+};
+
+class RadosInterface {
+ public:
+  RadosInterface() { cluster = arrow::internal::make_unique(); }
+  /// Initializes a cluster handle.
+  arrow::Status init2(const char* const name, const char* const clustername,
+                      uint64_t flags);
+  /// Create an I/O context
+  arrow::Status ioctx_create(const char* name, IoCtxInterface* pioctx);
+  /// Read the Ceph config file.
+  arrow::Status conf_read_file(const char* const path);
+  /// Connect to the Ceph cluster.
+  arrow::Status connect();
+  /// Close connection to the Ceph cluster.
+  void shutdown();
+
+ private:
+  std::unique_ptr cluster;
+};
+
+/// Connect to a Ceph cluster and hold the connection
+/// information for use in later stages.
+class RadosConn {
+ public:
+  explicit RadosConn(std::shared_ptr ctx)
+      : ctx(std::move(ctx)),
+        rados(arrow::internal::make_unique()),
+        io_ctx(arrow::internal::make_unique()),
+        connected(false) {}
+  ~RadosConn();
+  /// Connect to the Ceph cluster.
+  arrow::Status Connect();
+  /// Shutdown the connection to the Ceph
+  /// cluster if already connected.
+ void Shutdown(); + + std::shared_ptr ctx; + std::unique_ptr rados; + std::unique_ptr io_ctx; + bool connected; +}; + +} // namespace rados +} // namespace skyhook diff --git a/cpp/src/skyhook/protocol/skyhook_protocol.cc b/cpp/src/skyhook/protocol/skyhook_protocol.cc new file mode 100644 index 0000000000000..c261048197209 --- /dev/null +++ b/cpp/src/skyhook/protocol/skyhook_protocol.cc @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "skyhook/protocol/skyhook_protocol.h" + +#include + +#include "ScanRequest_generated.h" +#include "arrow/io/api.h" +#include "arrow/ipc/api.h" +#include "arrow/result.h" +#include "arrow/util/io_util.h" + +namespace skyhook { + +namespace flatbuf = org::apache::arrow::flatbuf; + +arrow::Status SerializeScanRequest(ScanRequest& req, ceph::bufferlist* bl) { + ARROW_ASSIGN_OR_RAISE(auto filter_expression, + arrow::compute::Serialize(req.filter_expression)); + ARROW_ASSIGN_OR_RAISE(auto partition_expression, + arrow::compute::Serialize(req.partition_expression)); + ARROW_ASSIGN_OR_RAISE(auto projection_schema, + arrow::ipc::SerializeSchema(*req.projection_schema)); + ARROW_ASSIGN_OR_RAISE(auto dataset_schema, + arrow::ipc::SerializeSchema(*req.dataset_schema)); + + flatbuffers::FlatBufferBuilder builder(1024); + auto filter_expression_vector = + builder.CreateVector(filter_expression->data(), filter_expression->size()); + auto partition_expression_vector = + builder.CreateVector(partition_expression->data(), partition_expression->size()); + auto projected_schema_vector = + builder.CreateVector(projection_schema->data(), projection_schema->size()); + auto dataset_schema_vector = + builder.CreateVector(dataset_schema->data(), dataset_schema->size()); + + auto request = flatbuf::CreateScanRequest( + builder, req.file_size, static_cast(req.file_format), filter_expression_vector, + partition_expression_vector, dataset_schema_vector, projected_schema_vector); + builder.Finish(request); + uint8_t* buf = builder.GetBufferPointer(); + int size = builder.GetSize(); + + bl->append(reinterpret_cast(buf), size); + return arrow::Status::OK(); +} + +arrow::Status DeserializeScanRequest(ceph::bufferlist& bl, ScanRequest* req) { + auto request = flatbuf::GetScanRequest((uint8_t*)bl.c_str()); + + ARROW_ASSIGN_OR_RAISE(auto filter_expression, + arrow::compute::Deserialize(std::make_shared( + request->filter()->data(), request->filter()->size()))); + req->filter_expression = filter_expression; + + ARROW_ASSIGN_OR_RAISE(auto partition_expression, + arrow::compute::Deserialize(std::make_shared( + request->partition()->data(), request->partition()->size()))); + req->partition_expression = partition_expression; + + arrow::ipc::DictionaryMemo empty_memo; + arrow::io::BufferReader 
projection_schema_reader(request->projection_schema()->data(), + request->projection_schema()->size()); + arrow::io::BufferReader dataset_schema_reader(request->dataset_schema()->data(), + request->dataset_schema()->size()); + + ARROW_ASSIGN_OR_RAISE(req->projection_schema, + arrow::ipc::ReadSchema(&projection_schema_reader, &empty_memo)); + ARROW_ASSIGN_OR_RAISE(req->dataset_schema, + arrow::ipc::ReadSchema(&dataset_schema_reader, &empty_memo)); + + req->file_size = request->file_size(); + req->file_format = (SkyhookFileType::type)request->file_format(); + return arrow::Status::OK(); +} + +arrow::Status SerializeTable(const std::shared_ptr& table, + ceph::bufferlist* bl) { + ARROW_ASSIGN_OR_RAISE(auto buffer_output_stream, + arrow::io::BufferOutputStream::Create()); + + auto options = arrow::ipc::IpcWriteOptions::Defaults(); + auto codec = arrow::Compression::LZ4_FRAME; + + ARROW_ASSIGN_OR_RAISE(options.codec, arrow::util::Codec::Create(codec)); + ARROW_ASSIGN_OR_RAISE(auto writer, arrow::ipc::MakeStreamWriter( + buffer_output_stream, table->schema(), options)); + + ARROW_RETURN_NOT_OK(writer->WriteTable(*table)); + ARROW_RETURN_NOT_OK(writer->Close()); + + ARROW_ASSIGN_OR_RAISE(auto buffer, buffer_output_stream->Finish()); + bl->append(reinterpret_cast(buffer->data()), buffer->size()); + return arrow::Status::OK(); +} + +arrow::Status DeserializeTable(ceph::bufferlist& bl, bool use_threads, + arrow::RecordBatchVector* batches) { + auto buffer = std::make_shared((uint8_t*)bl.c_str(), bl.length()); + auto buffer_reader = std::make_shared(buffer); + auto options = arrow::ipc::IpcReadOptions::Defaults(); + options.use_threads = use_threads; + ARROW_ASSIGN_OR_RAISE( + auto reader, arrow::ipc::RecordBatchStreamReader::Open(buffer_reader, options)); + ARROW_RETURN_NOT_OK(reader->ReadAll(batches)); + return arrow::Status::OK(); +} + +arrow::Status ExecuteObjectClassFn(const std::shared_ptr& connection, + const std::string& oid, const std::string& fn, + ceph::bufferlist& in, ceph::bufferlist& out) { + int e = arrow::internal::ErrnoFromStatus(connection->io_ctx->exec( + oid.c_str(), connection->ctx->ceph_cls_name.c_str(), fn.c_str(), in, out)); + + if (e == SCAN_ERR_CODE) return arrow::Status::Invalid(SCAN_ERR_MSG); + if (e == SCAN_REQ_DESER_ERR_CODE) return arrow::Status::Invalid(SCAN_REQ_DESER_ERR_MSG); + if (e == SCAN_RES_SER_ERR_CODE) return arrow::Status::Invalid(SCAN_RES_SER_ERR_MSG); + if (e != 0) return arrow::Status::Invalid(SCAN_UNKNOWN_ERR_MSG); + return arrow::Status::OK(); +} + +} // namespace skyhook diff --git a/cpp/src/skyhook/protocol/skyhook_protocol.h b/cpp/src/skyhook/protocol/skyhook_protocol.h new file mode 100644 index 0000000000000..b4f6d6ee1b477 --- /dev/null +++ b/cpp/src/skyhook/protocol/skyhook_protocol.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "skyhook/protocol/rados_protocol.h"
+
+#include 
+#include 
+
+#include "arrow/compute/exec/expression.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+
+#define SCAN_UNKNOWN_ERR_MSG "something went wrong while scanning file fragment"
+#define SCAN_ERR_CODE 25
+#define SCAN_ERR_MSG "failed to scan file fragment"
+#define SCAN_REQ_DESER_ERR_CODE 26
+#define SCAN_REQ_DESER_ERR_MSG "failed to deserialize scan request"
+#define SCAN_RES_SER_ERR_CODE 27
+#define SCAN_RES_SER_ERR_MSG "failed to serialize result table"
+
+namespace skyhook {
+
+/// An enum to represent the different
+/// types of file formats that Skyhook supports.
+struct SkyhookFileType {
+  enum type { PARQUET, IPC };
+};
+
+/// A struct encapsulating all the parameters
+/// required to be serialized in the form of flatbuffers for
+/// sending to the Ceph object class (cls).
+struct ScanRequest {
+  arrow::compute::Expression filter_expression;
+  arrow::compute::Expression partition_expression;
+  std::shared_ptr projection_schema;
+  std::shared_ptr dataset_schema;
+  int64_t file_size;
+  SkyhookFileType::type file_format;
+};
+
+/// Utility functions to serialize and deserialize scan requests and result Arrow tables.
+arrow::Status SerializeScanRequest(ScanRequest& req, ceph::bufferlist* bl);
+arrow::Status DeserializeScanRequest(ceph::bufferlist& bl, ScanRequest* req);
+arrow::Status SerializeTable(const std::shared_ptr& table,
+                             ceph::bufferlist* bl);
+arrow::Status DeserializeTable(ceph::bufferlist& bl, bool use_threads,
+                               arrow::RecordBatchVector* batches);
+
+/// Utility function to invoke a RADOS object class function on a RADOS object.
+arrow::Status ExecuteObjectClassFn(const std::shared_ptr& connection,
+                                   const std::string& oid, const std::string& fn,
+                                   ceph::bufferlist& in, ceph::bufferlist& out);
+
+/// An interface for translating the name of a file in CephFS to its
+/// corresponding object ID in RADOS, assuming a 1:1 mapping between a file
+/// and its underlying object.
+class SkyhookDirectObjectAccess {
+ public:
+  explicit SkyhookDirectObjectAccess(std::shared_ptr connection)
+      : connection_(std::move(connection)) {}
+
+  ~SkyhookDirectObjectAccess() = default;
+
+  /// Execute a POSIX stat on a file.
+  arrow::Status Stat(const std::string& path, struct stat& st) {
+    struct stat file_st;
+    if (stat(path.c_str(), &file_st) < 0)
+      return arrow::Status::Invalid("stat returned non-zero exit code.");
+    st = file_st;
+    return arrow::Status::OK();
+  }
+
+  /// Convert a file inode to a RADOS object ID.
+  std::string ConvertInodeToOID(uint64_t inode) {
+    std::stringstream ss;
+    /// In Ceph, the underlying stripes that make up a file are
+    /// named in the format [hex(inode)].[8-bit-binary(stripe_index)],
+    /// so inode 42 (0x2a), for example, maps to "2a.00000000".
+    ss << std::hex << inode;
+
+    /// Since in Skyhook we ensure a single stripe per file,
+    /// we can assume the stripe index is always 0 and hence
+    /// hardcode its 8-bit binary form.
+    std::string oid(ss.str() + ".00000000");
+    return oid;
+  }
+
+  /// Execute an object class method. It uses the `librados::exec` API to
+  /// perform object class method calls on the storage node and
+  /// stores the result in an output bufferlist.
+ arrow::Status Exec(uint64_t inode, const std::string& fn, ceph::bufferlist& in, + ceph::bufferlist& out) { + std::string oid = ConvertInodeToOID(inode); + return ExecuteObjectClassFn(connection_, oid, fn, in, out); + } + + private: + std::shared_ptr connection_; +}; + +} // namespace skyhook diff --git a/cpp/src/skyhook/protocol/skyhook_protocol_test.cc b/cpp/src/skyhook/protocol/skyhook_protocol_test.cc new file mode 100644 index 0000000000000..1d3af3ef72cf1 --- /dev/null +++ b/cpp/src/skyhook/protocol/skyhook_protocol_test.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "skyhook/protocol/skyhook_protocol.h" + +#include "arrow/compute/exec/expression.h" +#include "arrow/dataset/test_util.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_util.h" + +std::shared_ptr CreateTable() { + auto schema = arrow::schema({ + {arrow::field("a", arrow::uint8())}, + {arrow::field("b", arrow::uint32())}, + }); + + std::shared_ptr table; + return TableFromJSON(schema, {R"([{"a": null, "b": 5}, + {"a": 1, "b": 3}, + {"a": 3, "b": null}, + {"a": null, "b": null}, + {"a": 2, "b": 5}, + {"a": 1, "b": 5} + ])"}); +} + +TEST(TestSkyhookProtocol, SerDeserScanRequest) { + ceph::bufferlist bl; + skyhook::ScanRequest req; + req.filter_expression = arrow::compute::literal(true); + req.partition_expression = arrow::compute::literal(false); + req.projection_schema = arrow::schema({arrow::field("a", arrow::int64())}); + req.dataset_schema = arrow::schema({arrow::field("a", arrow::int64())}); + req.file_size = 1000000; + req.file_format = skyhook::SkyhookFileType::type::IPC; + ASSERT_OK(skyhook::SerializeScanRequest(req, &bl)); + + skyhook::ScanRequest req_; + ASSERT_OK(skyhook::DeserializeScanRequest(bl, &req_)); + ASSERT_TRUE(req.filter_expression.Equals(req_.filter_expression)); + ASSERT_TRUE(req.partition_expression.Equals(req_.partition_expression)); + ASSERT_TRUE(req.projection_schema->Equals(req_.projection_schema)); + ASSERT_TRUE(req.dataset_schema->Equals(req_.dataset_schema)); + ASSERT_EQ(req.file_size, req_.file_size); + ASSERT_EQ(req.file_format, req_.file_format); +} + +TEST(TestSkyhookProtocol, SerDeserTable) { + std::shared_ptr table = CreateTable(); + ceph::bufferlist bl; + ASSERT_OK(skyhook::SerializeTable(table, &bl)); + + arrow::RecordBatchVector batches; + ASSERT_OK(skyhook::DeserializeTable(bl, false, &batches)); + ASSERT_OK_AND_ASSIGN(auto materialized_table, arrow::Table::FromRecordBatches(batches)); + + ASSERT_TRUE(table->Equals(*materialized_table)); +} diff --git a/cpp/src/skyhook/skyhook.pc.in b/cpp/src/skyhook/skyhook.pc.in new file mode 100644 index 0000000000000..a3a4da5ee9c93 --- /dev/null +++ b/cpp/src/skyhook/skyhook.pc.in @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under 
one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@CMAKE_INSTALL_PREFIX@ +libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: Skyhook +Description: Skyhook is a plugin for offloading computations into Ceph. +Version: @SKYHOOK_VERSION@ +Requires: arrow_dataset +Libs: -L${libdir} -larrow_skyhook_client \ No newline at end of file diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 2c6784e314b80..c007e7f428af0 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -896,6 +896,15 @@ tasks: UBUNTU: 20.04 image: ubuntu-cpp-bundled + test-skyhook-integration: + ci: github + template: docker-tests/github.linux.yml + params: + env: + UBUNTU: 20.04 + flags: -e ARROW_SKYHOOK=ON + image: ubuntu-cpp + test-debian-11-cpp: ci: github template: docker-tests/github.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index 455424d2cc54a..93314e440a2da 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -298,9 +298,11 @@ services: volumes: &debian-volumes - .:/arrow:delegated - ${DOCKER_VOLUME_PREFIX}debian-ccache:/ccache:delegated + # integration_skyhook.sh is a no-op unless skyhook is on. command: &cpp-command > /bin/bash -c " /arrow/ci/scripts/cpp_build.sh /arrow /build && + /arrow/ci/scripts/integration_skyhook.sh /build && /arrow/ci/scripts/cpp_test.sh /arrow /build" ubuntu-cpp: @@ -324,6 +326,7 @@ services: gcc_version: ${GCC_VERSION} shm_size: *shm-size ulimits: *ulimits + privileged: true environment: <<: *ccache ARROW_ENABLE_TIMING_TESTS: # inherit From 9bd7b32d0692a6d718147483a4e41c3cba67b826 Mon Sep 17 00:00:00 2001 From: Benson Muite Date: Fri, 22 Oct 2021 15:34:42 -0400 Subject: [PATCH 007/194] ARROW-14391: [Docs] Archery requires docker Closes #11472 from bkmgit/ARROW-14391 Lead-authored-by: Benson Muite Co-authored-by: David Li Signed-off-by: David Li --- docs/source/developers/archery.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/developers/archery.rst b/docs/source/developers/archery.rst index a587975d6c9e4..3f7cbee8fb4e1 100644 --- a/docs/source/developers/archery.rst +++ b/docs/source/developers/archery.rst @@ -26,7 +26,7 @@ utility called Archery. Installation ------------ -Archery requires Python 3.6 or later. It is recommended to install archery in +Archery requires Python 3.6 or later. It is recommended to install Archery in *editable* mode with the ``-e`` flag to automatically update the installation when pulling the Arrow repository. After cloning the Arrow repository, from the top level directory install Archery by using the command @@ -35,6 +35,10 @@ the top level directory install Archery by using the command pip install -e dev/archery[all] +Many operations in Archery make use of `Docker `_ +and `docker-compose `_, which you may +also want to install. 
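Once Docker and docker-compose are available, a quick way to verify the setup is
to build and run one of the images defined in ``docker-compose.yml`` through
Archery (``ubuntu-cpp`` is just one of the services; ``archery docker images``
lists the rest)::

   archery docker run ubuntu-cpp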
+ Usage ----- @@ -85,3 +89,6 @@ help output, for example: images List the available docker-compose images. push Push the generated docker-compose image. run Execute docker-compose builds. + +A more detailed introduction to using docker with +Archery is available in a separate :ref:`page `. From 2dcafa19588246ff7b7abc553f6e8ab2c1989965 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Sat, 23 Oct 2021 22:42:31 +0800 Subject: [PATCH 008/194] ARROW-13981: [Java] VectorSchemaRootAppender doesn't work for BitVector Please see: https://issues.apache.org/jira/browse/ARROW-13981 Closes #11317 from liyafan82/fly_1002_app Authored-by: liyafan82 Signed-off-by: liyafan82 --- .../arrow/vector/util/VectorAppender.java | 14 +++++++--- .../arrow/vector/util/TestVectorAppender.java | 27 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index e5809e93ea802..ea78917c3ddb2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -25,6 +25,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; @@ -83,9 +84,16 @@ public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); // append data buffer - PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), - targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), - deltaVector.getTypeWidth() * deltaVector.getValueCount()); + if (targetVector instanceof BitVector) { + // special processing for bit vector, as its type width is 0 + BitVectorHelper.concatBits(targetVector.getDataBuffer(), targetVector.getValueCount(), + deltaVector.getDataBuffer(), deltaVector.getValueCount(), targetVector.getDataBuffer()); + + } else { + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), + deltaVector.getTypeWidth() * deltaVector.getValueCount()); + } targetVector.setValueCount(newValueCount); return targetVector; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 1cd26312008bb..25d26623d5c05 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -27,6 +27,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.LargeVarCharVector; @@ -96,6 +97,32 @@ public void testAppendFixedWidthVector() { } } + @Test + public void testAppendBitVector() { + final int length1 = 10; + final int length2 = 5; + try 
(BitVector target = new BitVector("", allocator); + BitVector delta = new BitVector("", allocator)) { + + target.allocateNew(length1); + delta.allocateNew(length2); + + ValueVectorDataPopulator.setVector(target, 0, 1, 0, 1, 0, 1, 0, null, 0, 1); + ValueVectorDataPopulator.setVector(delta, null, 1, 1, 0, 0); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + + try (BitVector expected = new BitVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, 0, 1, 0, 1, 0, 1, 0, null, 0, 1, null, 1, 1, 0, 0); + assertVectorsEqual(expected, target); + } + } + } + @Test public void testAppendEmptyFixedWidthVector() { try (IntVector target = new IntVector("", allocator); From aecdc0bd75ef14095ec2a560885c3f4e059bc730 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Sat, 23 Oct 2021 14:09:59 -0400 Subject: [PATCH 009/194] ARROW-13984: [Go][Parquet] file handling for go parquet, just the readers This implements the file/column and page readers for Parquet files. In order to keep this smaller, I've only included what was necessary for the readers and will make a separate PR for the file and column writers after this. Closes #11146 from zeroshade/goparquet-file Lead-authored-by: Matthew Topol Co-authored-by: Matt Topol Signed-off-by: Matthew Topol --- go/parquet/file/column_reader.go | 498 ++++++++++++++ go/parquet/file/column_reader_test.go | 450 +++++++++++++ go/parquet/file/column_reader_types.gen.go | 299 +++++++++ .../file/column_reader_types.gen.go.tmpl | 62 ++ go/parquet/file/file_reader.go | 336 ++++++++++ go/parquet/file/file_reader_test.go | 304 +++++++++ go/parquet/file/level_conversion.go | 262 ++++++++ go/parquet/file/level_conversion_test.go | 194 ++++++ go/parquet/file/page_reader.go | 620 ++++++++++++++++++ go/parquet/file/row_group_reader.go | 130 ++++ go/parquet/go.sum | 1 + go/parquet/internal/bmi/bitmap_bmi2_noasm.go | 24 + go/parquet/internal/bmi/bmi.go | 2 +- .../internal/encoding/boolean_decoder.go | 4 +- .../internal/encoding/boolean_encoder.go | 3 + .../internal/encoding/typed_encoder.gen.go | 158 ++++- .../encoding/typed_encoder.gen.go.tmpl | 46 +- go/parquet/internal/testutils/pagebuilder.go | 297 +++++++++ go/parquet/reader_properties.go | 3 +- go/parquet/types.go | 10 + 20 files changed, 3689 insertions(+), 14 deletions(-) create mode 100644 go/parquet/file/column_reader.go create mode 100644 go/parquet/file/column_reader_test.go create mode 100644 go/parquet/file/column_reader_types.gen.go create mode 100644 go/parquet/file/column_reader_types.gen.go.tmpl create mode 100644 go/parquet/file/file_reader.go create mode 100644 go/parquet/file/file_reader_test.go create mode 100644 go/parquet/file/level_conversion.go create mode 100644 go/parquet/file/level_conversion_test.go create mode 100644 go/parquet/file/page_reader.go create mode 100644 go/parquet/file/row_group_reader.go create mode 100644 go/parquet/internal/bmi/bitmap_bmi2_noasm.go create mode 100644 go/parquet/internal/testutils/pagebuilder.go diff --git a/go/parquet/file/column_reader.go b/go/parquet/file/column_reader.go new file mode 100644 index 0000000000000..79c6479b05b0d --- /dev/null +++ b/go/parquet/file/column_reader.go @@ -0,0 +1,498 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" + "github.com/apache/arrow/go/parquet/internal/encryption" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" + "golang.org/x/xerrors" +) + +const ( + // 4 MB is the default maximum page header size + defaultMaxPageHeaderSize = 4 * 1024 * 1024 + // 16 KB is the default expected page header size + defaultPageHeaderSize = 16 * 1024 +) + +//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_reader_types.gen.go.tmpl + +func isDictIndexEncoding(e format.Encoding) bool { + return e == format.Encoding_RLE_DICTIONARY || e == format.Encoding_PLAIN_DICTIONARY +} + +// CryptoContext is a context for keeping track of the current methods for decrypting. +// It keeps track of the row group and column numbers along with references to the +// decryptor objects. +type CryptoContext struct { + StartDecryptWithDictionaryPage bool + RowGroupOrdinal int16 + ColumnOrdinal int16 + MetaDecryptor encryption.Decryptor + DataDecryptor encryption.Decryptor +} + +// ColumnChunkReader is the basic interface for all column readers. It will use +// a page reader to read all the pages in a column chunk from a row group. +// +// To actually Read out the column data, you need to convert to the properly +// typed ColumnChunkReader type such as *BooleanColumnReader etc. +// +// Some things to clarify when working with column readers: +// +// "Values" refers to the physical data values in a data page. +// +// This is separate from the number of "rows" in a column and the total number +// of "elements" in a column because null values aren't stored physically in the +// data page but are represented via definition levels, so the number of values +// in a column can be less than the number of rows. +// +// The total number of "elements" in a column also differs because of potential +// repeated fields, where you can have multiple values in the page which +// together make up a single element (such as a list) or depending on the repetition +// level and definition level, could represent an entire null list or just a null +// element inside of a list. +type ColumnChunkReader interface { + // HasNext returns whether there is more data to be read in this column + // and row group. + HasNext() bool + // Type returns the underlying physical type of the column + Type() parquet.Type + // Descriptor returns the column schema container + Descriptor() *schema.Column + // if HasNext returns false because of an error, this will return the error + // it encountered. 
Otherwise this will be nil if it's just the end of the + // column + Err() error + // Skip buffered values + consumeBufferedValues(int64) + // number of available buffered values that have not been decoded yet + // when this returns 0, you're at the end of a page. + numAvailValues() int64 + // read the definition levels and return the number of definitions, + // and the number of values to be read (number of def levels == maxdef level) + // it also populates the passed in slice which should be sized appropriately. + readDefinitionLevels(levels []int16) (int, int64) + // read the repetition levels and return the number of repetition levels read + // also populates the passed in slice, which should be sized appropriately. + readRepetitionLevels(levels []int16) int + // a column is made up of potentially multiple pages across potentially multiple + // row groups. A PageReader allows looping through the pages in a single row group. + // When moving to another row group for reading, use setPageReader to re-use the + // column reader for reading the pages of the new row group. + pager() PageReader + // set a page reader into the columnreader so it can be reused. + // + // This will clear any current error in the reader but does not + // automatically read the first page of the page reader passed in until + // HasNext which will read in the next page. + setPageReader(PageReader) +} + +type columnChunkReader struct { + descr *schema.Column + rdr PageReader + repetitionDecoder encoding.LevelDecoder + definitionDecoder encoding.LevelDecoder + + curPage Page + curEncoding format.Encoding + curDecoder encoding.TypedDecoder + + // number of currently buffered values in the current page + numBuffered int64 + // the number of values we've decoded so far + numDecoded int64 + mem memory.Allocator + + decoders map[format.Encoding]encoding.TypedDecoder + decoderTraits encoding.DecoderTraits + + // is set when an error is encountered + err error + defLvlBuffer []int16 +} + +// NewColumnReader returns a column reader for the provided column initialized with the given pagereader that will +// provide the pages of data for this column. The type is determined from the column passed in. 
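Because the concrete reader type is keyed off the column's physical type, callers normally type-assert the value returned here. A minimal sketch of that wiring against the API introduced in this patch; the function name, the column name "a", and the source of the PageReader are illustrative assumptions, and the descriptor construction mirrors this package's own tests:

package example

import (
    "github.com/apache/arrow/go/arrow/memory"
    "github.com/apache/arrow/go/parquet"
    "github.com/apache/arrow/go/parquet/file"
    "github.com/apache/arrow/go/parquet/schema"
)

// readRequiredInt32 drains a single required INT32 column chunk given a
// PageReader for its pages. Both the function and its input are hypothetical.
func readRequiredInt32(pages file.PageReader) ([]int32, error) {
    node := schema.NewInt32Node("a", parquet.Repetitions.Required, -1)
    descr := schema.NewColumn(node, 0, 0) // max def level 0, max rep level 0

    rdr := file.NewColumnReader(descr, pages, memory.DefaultAllocator).(*file.Int32ColumnChunkReader)

    out := make([]int32, 0)
    buf := make([]int32, 1024)
    for rdr.HasNext() {
        // a required column stores no def/rep levels, so nil is fine for both
        _, n, err := rdr.ReadBatch(int64(len(buf)), buf, nil, nil)
        if err != nil {
            return nil, err
        }
        out = append(out, buf[:n]...)
    }
    return out, rdr.Err()
}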
+func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator) ColumnChunkReader { + base := columnChunkReader{descr: descr, rdr: pageReader, mem: mem, decoders: make(map[format.Encoding]encoding.TypedDecoder)} + switch descr.PhysicalType() { + case parquet.Types.FixedLenByteArray: + base.decoderTraits = &encoding.FixedLenByteArrayDecoderTraits + return &FixedLenByteArrayColumnChunkReader{base} + case parquet.Types.Float: + base.decoderTraits = &encoding.Float32DecoderTraits + return &Float32ColumnChunkReader{base} + case parquet.Types.Double: + base.decoderTraits = &encoding.Float64DecoderTraits + return &Float64ColumnChunkReader{base} + case parquet.Types.ByteArray: + base.decoderTraits = &encoding.ByteArrayDecoderTraits + return &ByteArrayColumnChunkReader{base} + case parquet.Types.Int32: + base.decoderTraits = &encoding.Int32DecoderTraits + return &Int32ColumnChunkReader{base} + case parquet.Types.Int64: + base.decoderTraits = &encoding.Int64DecoderTraits + return &Int64ColumnChunkReader{base} + case parquet.Types.Int96: + base.decoderTraits = &encoding.Int96DecoderTraits + return &Int96ColumnChunkReader{base} + case parquet.Types.Boolean: + base.decoderTraits = &encoding.BooleanDecoderTraits + return &BooleanColumnChunkReader{base} + } + return nil +} + +func (c *columnChunkReader) Err() error { return c.err } +func (c *columnChunkReader) Type() parquet.Type { return c.descr.PhysicalType() } +func (c *columnChunkReader) Descriptor() *schema.Column { return c.descr } +func (c *columnChunkReader) consumeBufferedValues(n int64) { c.numDecoded += n } +func (c *columnChunkReader) numAvailValues() int64 { return c.numBuffered - c.numDecoded } +func (c *columnChunkReader) pager() PageReader { return c.rdr } +func (c *columnChunkReader) setPageReader(rdr PageReader) { + c.rdr, c.err = rdr, nil + c.decoders = make(map[format.Encoding]encoding.TypedDecoder) + c.numBuffered, c.numDecoded = 0, 0 +} + +func (c *columnChunkReader) getDefLvlBuffer(sz int64) []int16 { + if int64(len(c.defLvlBuffer)) < sz { + c.defLvlBuffer = make([]int16, sz) + return c.defLvlBuffer + } + + return c.defLvlBuffer[:sz] +} + +// HasNext returns whether there is more data to be read in this column +// and row group. 
+func (c *columnChunkReader) HasNext() bool { + if c.numBuffered == 0 || c.numDecoded == c.numBuffered { + return c.readNewPage() && c.numBuffered != 0 + } + return true +} + +func (c *columnChunkReader) configureDict(page *DictionaryPage) error { + enc := page.encoding + if enc == format.Encoding_PLAIN_DICTIONARY || enc == format.Encoding_PLAIN { + enc = format.Encoding_RLE_DICTIONARY + } + + if _, ok := c.decoders[enc]; ok { + return xerrors.New("parquet: column chunk cannot have more than one dictionary.") + } + + switch page.Encoding() { + case format.Encoding_PLAIN, format.Encoding_PLAIN_DICTIONARY: + dict := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, false, c.mem) + dict.SetData(int(page.NumValues()), page.Data()) + + decoder := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, true, c.mem).(encoding.DictDecoder) + decoder.SetDict(dict) + c.decoders[enc] = decoder + default: + return xerrors.New("parquet: dictionary index must be plain encoding") + } + + c.curDecoder = c.decoders[enc] + return nil +} + +// read a new page from the page reader +func (c *columnChunkReader) readNewPage() bool { + for c.rdr.Next() { // keep going until we get a data page + c.curPage = c.rdr.Page() + if c.curPage == nil { + break + } + + var lvlByteLen int64 + switch p := c.curPage.(type) { + case *DictionaryPage: + if err := c.configureDict(p); err != nil { + c.err = err + return false + } + continue + case *DataPageV1: + lvlByteLen, c.err = c.initLevelDecodersV1(p, p.repLvlEncoding, p.defLvlEncoding) + if c.err != nil { + return false + } + case *DataPageV2: + lvlByteLen, c.err = c.initLevelDecodersV2(p) + if c.err != nil { + return false + } + default: + // we can skip non-data pages + continue + } + + c.err = c.initDataDecoder(c.curPage, lvlByteLen) + return c.err == nil + } + c.err = c.rdr.Err() + return false +} + +func (c *columnChunkReader) initLevelDecodersV2(page *DataPageV2) (int64, error) { + c.numBuffered = int64(page.nvals) + c.numDecoded = 0 + buf := page.Data() + totalLvlLen := int64(page.repLvlByteLen) + int64(page.defLvlByteLen) + + if totalLvlLen > int64(len(buf)) { + return totalLvlLen, xerrors.New("parquet: data page too small for levels (corrupt header?)") + } + + if c.descr.MaxRepetitionLevel() > 0 { + c.repetitionDecoder.SetDataV2(page.repLvlByteLen, c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf) + buf = buf[page.repLvlByteLen:] + } + + if c.descr.MaxDefinitionLevel() > 0 { + c.definitionDecoder.SetDataV2(page.defLvlByteLen, c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf) + } + + return totalLvlLen, nil +} + +func (c *columnChunkReader) initLevelDecodersV1(page *DataPageV1, repLvlEncoding, defLvlEncoding format.Encoding) (int64, error) { + c.numBuffered = int64(page.nvals) + c.numDecoded = 0 + + buf := page.Data() + maxSize := len(buf) + levelsByteLen := int64(0) + + // Data page layout: Repetition Levels - Definition Levels - encoded values. 
+ // Levels are encoded as rle or bit-packed + if c.descr.MaxRepetitionLevel() > 0 { + repBytes, err := c.repetitionDecoder.SetData(parquet.Encoding(repLvlEncoding), c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf) + if err != nil { + return levelsByteLen, err + } + buf = buf[repBytes:] + maxSize -= repBytes + levelsByteLen += int64(repBytes) + } + + if c.descr.MaxDefinitionLevel() > 0 { + defBytes, err := c.definitionDecoder.SetData(parquet.Encoding(defLvlEncoding), c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf) + if err != nil { + return levelsByteLen, err + } + levelsByteLen += int64(defBytes) + maxSize -= defBytes + } + + return levelsByteLen, nil +} + +func (c *columnChunkReader) initDataDecoder(page Page, lvlByteLen int64) error { + buf := page.Data() + if int64(len(buf)) < lvlByteLen { + return xerrors.New("parquet: page smaller than size of encoded levels") + } + + buf = buf[lvlByteLen:] + encoding := page.Encoding() + + if isDictIndexEncoding(encoding) { + encoding = format.Encoding_RLE_DICTIONARY + } + + if decoder, ok := c.decoders[encoding]; ok { + c.curDecoder = decoder + } else { + switch encoding { + case format.Encoding_PLAIN, + format.Encoding_DELTA_BYTE_ARRAY, + format.Encoding_DELTA_LENGTH_BYTE_ARRAY, + format.Encoding_DELTA_BINARY_PACKED: + c.curDecoder = c.decoderTraits.Decoder(parquet.Encoding(encoding), c.descr, false, c.mem) + c.decoders[encoding] = c.curDecoder + case format.Encoding_RLE_DICTIONARY: + return xerrors.New("parquet: dictionary page must be before data page") + case format.Encoding_BYTE_STREAM_SPLIT: + return xerrors.Errorf("parquet: unsupported data encoding %s", encoding) + default: + return xerrors.Errorf("parquet: unknown encoding type %s", encoding) + } + } + + c.curEncoding = encoding + c.curDecoder.SetData(int(c.numBuffered), buf) + return nil +} + +// readDefinitionLevels decodes the definition levels from the page and returns +// it returns the total number of levels that were decoded (and thus populated +// in the passed in slice) and the number of physical values that exist to read +// (the number of levels that are equal to the max definition level). +// +// If the max definition level is 0, the assumption is that there no nulls in the +// column and therefore no definition levels to read, so it will always return 0, 0 +func (c *columnChunkReader) readDefinitionLevels(levels []int16) (totalDecoded int, valuesToRead int64) { + if c.descr.MaxDefinitionLevel() == 0 { + return 0, 0 + } + + return c.definitionDecoder.Decode(levels) +} + +// readRepetitionLevels decodes the repetition levels from the page and returns +// the total number of values decoded (and thus populated in the passed in levels +// slice). +// +// If max repetition level is 0, it is assumed there are no repetition levels, +// and thus will always return 0. +func (c *columnChunkReader) readRepetitionLevels(levels []int16) int { + if c.descr.MaxRepetitionLevel() == 0 { + return 0 + } + + nlevels, _ := c.repetitionDecoder.Decode(levels) + return nlevels +} + +// determineNumToRead reads the definition levels (and optionally populates the repetition levels) +// in order to determine how many values need to be read to fulfill this batch read. +// +// batchLen is the number of values it is desired to read. defLvls must be either nil (in which case +// a buffer will be used) or must be at least batchLen in length to be safe. repLvls should be either nil +// (in which case it is ignored) or should be at least batchLen in length to be safe. 
+// +// In the return values: ndef is the number of definition levels that were actually read in which will +// typically be the minimum of batchLen and numAvailValues. +// toRead is the number of physical values that should be read in based on the definition levels (the number +// of definition levels that were equal to maxDefinitionLevel). and err being either nil or any error encountered +func (c *columnChunkReader) determineNumToRead(batchLen int64, defLvls, repLvls []int16) (ndefs int, toRead int64, err error) { + if !c.HasNext() { + return 0, 0, c.err + } + + size := utils.Min(batchLen, c.numBuffered-c.numDecoded) + + if c.descr.MaxDefinitionLevel() > 0 { + if defLvls == nil { + defLvls = c.getDefLvlBuffer(size) + } + ndefs, toRead = c.readDefinitionLevels(defLvls[:size]) + } else { + toRead = size + } + + if c.descr.MaxRepetitionLevel() > 0 && repLvls != nil { + nreps := c.readRepetitionLevels(repLvls[:size]) + if defLvls != nil && ndefs != nreps { + err = xerrors.New("parquet: number of decoded rep/def levels did not match") + } + } + return +} + +// skipValues some number of rows using readFn as the function to read the data and throw it away. +// If we can skipValues a whole page based on its metadata, then we do so, otherwise we read the +// page until we have skipped the number of rows desired. +func (c *columnChunkReader) skipValues(nvalues int64, readFn func(batch int64, buf []byte) (int64, error)) (int64, error) { + var err error + toskip := nvalues + for c.HasNext() && toskip > 0 { + // if number to skip is more than the number of undecoded values, skip the page + if toskip > (c.numBuffered - c.numDecoded) { + toskip -= c.numBuffered - c.numDecoded + c.numDecoded = c.numBuffered + } else { + var ( + batchSize int64 = 1024 + valsRead int64 = 0 + ) + + scratch := memory.NewResizableBuffer(c.mem) + scratch.Reserve(c.decoderTraits.BytesRequired(int(batchSize))) + defer scratch.Release() + + for { + batchSize = utils.Min(batchSize, toskip) + valsRead, err = readFn(batchSize, scratch.Buf()) + toskip -= valsRead + if valsRead <= 0 || toskip <= 0 || err != nil { + break + } + } + } + } + if c.err != nil { + err = c.err + } + return nvalues - toskip, err +} + +type readerFunc func(int64, int64) (int, error) + +// base function for reading a batch of values, this will read until it either reads in batchSize values or +// it hits the end of the column chunk, including reading multiple pages. +// +// totalValues is the total number of values which were read in, and thus would be the total number +// of definition levels and repetition levels which were populated (if they were non-nil). totalRead +// is the number of physical values that were read in (ie: the number of non-null values) +func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, readFn readerFunc) (totalLvls int64, totalRead int, err error) { + var ( + read int + defs []int16 + reps []int16 + ndefs int + toRead int64 + ) + + for c.HasNext() && totalLvls < batchSize && err == nil { + if defLvls != nil { + defs = defLvls[totalLvls:] + } + if repLvls != nil { + reps = repLvls[totalLvls:] + } + ndefs, toRead, err = c.determineNumToRead(batchSize-totalLvls, defs, reps) + if err != nil { + return totalLvls, totalRead, err + } + + read, err = readFn(int64(totalRead), toRead) + // the total number of values processed here is the maximum of + // the number of definition levels or the number of physical values read. 
+ // if this is a required field, ndefs will be 0 since there is no definition + // levels stored with it and `read` will be the number of values, otherwise + // we use ndefs since it will be equal to or greater than read. + totalVals := int64(utils.MaxInt(ndefs, read)) + c.consumeBufferedValues(totalVals) + + totalLvls += totalVals + totalRead += read + } + return totalLvls, totalRead, err +} diff --git a/go/parquet/file/column_reader_test.go b/go/parquet/file/column_reader_test.go new file mode 100644 index 0000000000000..d22e365fa805e --- /dev/null +++ b/go/parquet/file/column_reader_test.go @@ -0,0 +1,450 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file_test + +import ( + "math" + "math/rand" + "reflect" + "testing" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/file" + "github.com/apache/arrow/go/parquet/internal/testutils" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" +) + +func initValues(values reflect.Value) { + if values.Kind() != reflect.Slice { + panic("must init values with slice") + } + + r := rand.New(rand.NewSource(0)) + typ := values.Type().Elem() + switch { + case typ.Bits() <= 32: + max := int64(math.MaxInt32) + min := int64(math.MinInt32) + for i := 0; i < values.Len(); i++ { + values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min).Convert(reflect.TypeOf(int32(0)))) + } + case typ.Bits() <= 64: + max := int64(math.MaxInt64) + min := int64(math.MinInt64) + for i := 0; i < values.Len(); i++ { + values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min)) + } + } +} + +func initDictValues(values reflect.Value, numDicts int) { + repeatFactor := values.Len() / numDicts + initValues(values) + // add some repeated values + for j := 1; j < repeatFactor; j++ { + for i := 0; i < numDicts; i++ { + values.Index(numDicts*j + i).Set(values.Index(i)) + } + } + // computed only dict_per_page * repeat_factor - 1 values < num_values compute remaining + for i := numDicts * repeatFactor; i < values.Len(); i++ { + values.Index(i).Set(values.Index(i - numDicts*repeatFactor)) + } +} + +func makePages(version parquet.DataPageVersion, d *schema.Column, npages, lvlsPerPage int, typ reflect.Type, enc parquet.Encoding) ([]file.Page, int, reflect.Value, []int16, []int16) { + nlevels := lvlsPerPage * npages + nvalues := 0 + + maxDef := d.MaxDefinitionLevel() + maxRep := d.MaxRepetitionLevel() + + var ( + defLevels []int16 + repLevels []int16 + ) + + valuesPerPage := make([]int, npages) + if maxDef > 0 { + defLevels = make([]int16, nlevels) + testutils.FillRandomInt16(0, 0, maxDef, defLevels) + for idx := range valuesPerPage 
{ + numPerPage := 0 + for i := 0; i < lvlsPerPage; i++ { + if defLevels[i+idx*lvlsPerPage] == maxDef { + numPerPage++ + nvalues++ + } + } + valuesPerPage[idx] = numPerPage + } + } else { + nvalues = nlevels + valuesPerPage[0] = lvlsPerPage + for i := 1; i < len(valuesPerPage); i *= 2 { + copy(valuesPerPage[i:], valuesPerPage[:i]) + } + } + + if maxRep > 0 { + repLevels = make([]int16, nlevels) + testutils.FillRandomInt16(0, 0, maxRep, repLevels) + } + + values := reflect.MakeSlice(reflect.SliceOf(typ), nvalues, nvalues) + if enc == parquet.Encodings.Plain { + initValues(values) + return testutils.PaginatePlain(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.Plain), nvalues, values, defLevels, repLevels + } else if enc == parquet.Encodings.PlainDict || enc == parquet.Encodings.RLEDict { + initDictValues(values, lvlsPerPage) + return testutils.PaginateDict(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.RLEDict), nvalues, values, defLevels, repLevels + } + panic("invalid encoding type for make pages") +} + +func compareVectorWithDefLevels(left, right reflect.Value, defLevels []int16, maxDef, maxRep int16) assert.Comparison { + return func() bool { + if left.Kind() != reflect.Slice || right.Kind() != reflect.Slice { + return false + } + + if left.Type().Elem() != right.Type().Elem() { + return false + } + + iLeft, iRight := 0, 0 + for _, def := range defLevels { + if def == maxDef { + if !reflect.DeepEqual(left.Index(iLeft).Interface(), right.Index(iRight).Interface()) { + return false + } + iLeft++ + iRight++ + } else if def == (maxDef - 1) { + // null entry on the lowest nested level + iRight++ + } else if def < (maxDef - 1) { + // null entry on higher nesting level, only supported for non-repeating data + if maxRep == 0 { + iRight++ + } + } + } + return true + } +} + +var mem = memory.DefaultAllocator + +type PrimitiveReaderSuite struct { + suite.Suite + + dataPageVersion parquet.DataPageVersion + pager file.PageReader + reader file.ColumnChunkReader + pages []file.Page + values reflect.Value + defLevels []int16 + repLevels []int16 + nlevels int + nvalues int + maxDefLvl int16 + maxRepLvl int16 +} + +func (p *PrimitiveReaderSuite) TearDownTest() { + p.clear() +} + +func (p *PrimitiveReaderSuite) initReader(d *schema.Column) { + m := new(testutils.MockPageReader) + m.Test(p.T()) + m.TestData().Set("pages", p.pages) + m.On("Err").Return((error)(nil)) + p.pager = m + p.reader = file.NewColumnReader(d, m, mem) +} + +func (p *PrimitiveReaderSuite) checkResults() { + vresult := make([]int32, p.nvalues) + dresult := make([]int16, p.nlevels) + rresult := make([]int16, p.nlevels) + + var ( + read int64 = 0 + totalRead int = 0 + batchActual int = 0 + batchSize int32 = 8 + batch int = 0 + ) + + rdr := p.reader.(*file.Int32ColumnChunkReader) + p.Require().NotNil(rdr) + + // this will cover both cases: + // 1) batch size < page size (multiple ReadBatch from a single page) + // 2) batch size > page size (BatchRead limits to single page) + for { + read, batch, _ = rdr.ReadBatch(int64(batchSize), vresult[totalRead:], dresult[batchActual:], rresult[batchActual:]) + totalRead += batch + batchActual += int(read) + batchSize = int32(utils.MinInt(1<<24, utils.MaxInt(int(batchSize*2), 4096))) + if batch <= 0 { + break + } + } + + p.Equal(p.nlevels, batchActual) + p.Equal(p.nvalues, totalRead) + p.Equal(p.values.Interface(), vresult) + if p.maxDefLvl > 0 { + p.Equal(p.defLevels, dresult) + } + if 
p.maxRepLvl > 0 { + p.Equal(p.repLevels, rresult) + } + + // catch improper writes at EOS + read, batchActual, _ = rdr.ReadBatch(5, vresult, nil, nil) + p.Zero(batchActual) + p.Zero(read) +} + +func (p *PrimitiveReaderSuite) clear() { + p.values = reflect.ValueOf(nil) + p.defLevels = nil + p.repLevels = nil + p.pages = nil + p.pager = nil + p.reader = nil +} + +func (p *PrimitiveReaderSuite) testPlain(npages, levels int, d *schema.Column) { + p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) + p.nlevels = npages * levels + p.initReader(d) + p.checkResults() + p.clear() +} + +func (p *PrimitiveReaderSuite) testDict(npages, levels int, d *schema.Column) { + p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.RLEDict) + p.nlevels = npages * levels + p.initReader(d) + p.checkResults() + p.clear() +} + +func (p *PrimitiveReaderSuite) TestInt32FlatRequired() { + const ( + levelsPerPage int = 100 + npages int = 50 + ) + + p.maxDefLvl = 0 + p.maxRepLvl = 0 + + typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) + d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + p.testPlain(npages, levelsPerPage, d) + p.testDict(npages, levelsPerPage, d) +} + +func (p *PrimitiveReaderSuite) TestInt32FlatOptional() { + const ( + levelsPerPage int = 100 + npages int = 50 + ) + + p.maxDefLvl = 4 + p.maxRepLvl = 0 + typ := schema.NewInt32Node("b", parquet.Repetitions.Optional, -1) + d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + p.testPlain(npages, levelsPerPage, d) + p.testDict(npages, levelsPerPage, d) +} + +func (p *PrimitiveReaderSuite) TestInt32FlatRepeated() { + const ( + levelsPerPage int = 100 + npages int = 50 + ) + + p.maxDefLvl = 4 + p.maxRepLvl = 2 + typ := schema.NewInt32Node("c", parquet.Repetitions.Repeated, -1) + d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + p.testPlain(npages, levelsPerPage, d) + p.testDict(npages, levelsPerPage, d) +} + +func (p *PrimitiveReaderSuite) TestReadBatchMultiPage() { + const ( + levelsPerPage int = 100 + npages int = 3 + ) + + p.maxDefLvl = 0 + p.maxRepLvl = 0 + typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) + d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) + p.initReader(d) + + vresult := make([]int32, levelsPerPage*npages) + dresult := make([]int16, levelsPerPage*npages) + rresult := make([]int16, levelsPerPage*npages) + + rdr := p.reader.(*file.Int32ColumnChunkReader) + total, read, err := rdr.ReadBatch(int64(levelsPerPage*npages), vresult, dresult, rresult) + p.NoError(err) + p.EqualValues(levelsPerPage*npages, total) + p.EqualValues(levelsPerPage*npages, read) +} + +func (p *PrimitiveReaderSuite) TestInt32FlatRequiredSkip() { + const ( + levelsPerPage int = 100 + npages int = 5 + ) + + p.maxDefLvl = 0 + p.maxRepLvl = 0 + typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) + d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) + p.initReader(d) + + vresult := make([]int32, levelsPerPage/2) + dresult := make([]int16, levelsPerPage/2) + rresult := make([]int16, 
levelsPerPage/2) + + rdr := p.reader.(*file.Int32ColumnChunkReader) + + p.Run("skip_size > page_size", func() { + // Skip first 2 pages + skipped, _ := rdr.Skip(int64(2 * levelsPerPage)) + p.Equal(int64(2*levelsPerPage), skipped) + + rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) + subVals := p.values.Slice(2*levelsPerPage, int(2.5*float64(levelsPerPage))).Interface().([]int32) + p.Equal(subVals, vresult) + }) + + p.Run("skip_size == page_size", func() { + // skip across two pages + skipped, _ := rdr.Skip(int64(levelsPerPage)) + p.Equal(int64(levelsPerPage), skipped) + // read half a page + rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) + subVals := p.values.Slice(int(3.5*float64(levelsPerPage)), 4*levelsPerPage).Interface().([]int32) + p.Equal(subVals, vresult) + }) + + p.Run("skip_size < page_size", func() { + // skip limited to a single page + // Skip half a page + skipped, _ := rdr.Skip(int64(levelsPerPage / 2)) + p.Equal(int64(0.5*float32(levelsPerPage)), skipped) + // Read half a page + rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) + subVals := p.values.Slice(int(4.5*float64(levelsPerPage)), p.values.Len()).Interface().([]int32) + p.Equal(subVals, vresult) + }) +} + +func (p *PrimitiveReaderSuite) TestDictionaryEncodedPages() { + p.maxDefLvl = 0 + p.maxRepLvl = 0 + typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) + descr := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) + dummy := memory.NewResizableBuffer(mem) + + p.Run("Dict: Plain, Data: RLEDict", func() { + dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain) + dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0) + + p.pages = append(p.pages, dictPage, dataPage) + p.initReader(descr) + p.NotPanics(func() { p.reader.HasNext() }) + p.NoError(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.Run("Dict: Plain Dictionary, Data: Plain Dictionary", func() { + dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict) + dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.PlainDict, dummy, nil, nil, 0, 0) + p.pages = append(p.pages, dictPage, dataPage) + p.initReader(descr) + p.NotPanics(func() { p.reader.HasNext() }) + p.NoError(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.Run("Panic if dict page not first", func() { + dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0) + p.pages = append(p.pages, dataPage) + p.initReader(descr) + p.NotPanics(func() { p.False(p.reader.HasNext()) }) + p.Error(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.Run("Only RLE is supported", func() { + dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.DeltaByteArray) + p.pages = append(p.pages, dictPage) + p.initReader(descr) + p.NotPanics(func() { p.False(p.reader.HasNext()) }) + p.Error(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.Run("Cannot have more than one dict", func() { + dictPage1 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict) + dictPage2 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain) + p.pages = append(p.pages, dictPage1, dictPage2) + p.initReader(descr) + p.NotPanics(func() { p.False(p.reader.HasNext()) }) + p.Error(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.Run("Unsupported encoding", func() { + dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.DeltaByteArray, dummy, nil, nil, 
0, 0) + p.pages = append(p.pages, dataPage) + p.initReader(descr) + p.Panics(func() { p.reader.HasNext() }) + // p.Error(p.reader.Err()) + p.pages = p.pages[:0] + }) + + p.pages = p.pages[:2] +} + +func TestPrimitiveReader(t *testing.T) { + t.Parallel() + t.Run("datapage v1", func(t *testing.T) { + suite.Run(t, new(PrimitiveReaderSuite)) + }) + t.Run("datapage v2", func(t *testing.T) { + suite.Run(t, &PrimitiveReaderSuite{dataPageVersion: parquet.DataPageV2}) + }) +} diff --git a/go/parquet/file/column_reader_types.gen.go b/go/parquet/file/column_reader_types.gen.go new file mode 100644 index 0000000000000..ab1fd535bbf27 --- /dev/null +++ b/go/parquet/file/column_reader_types.gen.go @@ -0,0 +1,299 @@ +// Code generated by column_reader_types.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" +) + +// Int32ColumnChunkReader is the Typed Column chunk reader instance for reading +// Int32 column data. +type Int32ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *Int32ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + arrow.Int32Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *Int32ColumnChunkReader) ReadBatch(batchSize int64, values []int32, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.Int32Decoder).Decode(values[start : start+len]) + }) +} + +// Int64ColumnChunkReader is the Typed Column chunk reader instance for reading +// Int64 column data. +type Int64ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. 
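Because nulls are represented only through definition levels, the total and valuesRead results of ReadBatch differ for optional columns: total counts levels, while the non-null values are packed at the front of the values slice. A minimal sketch of reassembling a nullable column from those two pieces; the function, its inputs, and the package wrapper are illustrative, not part of this patch:

package example

import "github.com/apache/arrow/go/parquet/file"

// gatherNullableInt32 drains an optional INT32 column into a []*int32, using the
// returned definition levels to re-insert nulls between the densely packed values.
// The reader and maxDef arguments are assumed to be supplied by the caller; a value
// is physically present only when its definition level equals maxDef.
func gatherNullableInt32(rdr *file.Int32ColumnChunkReader, maxDef int16) ([]*int32, error) {
    vals := make([]int32, 1024)
    defs := make([]int16, 1024)
    var out []*int32
    for rdr.HasNext() {
        // total counts def levels; the packed non-null values sit at the front of vals
        total, _, err := rdr.ReadBatch(int64(len(vals)), vals, defs, nil)
        if err != nil {
            return nil, err
        }
        vi := 0
        for i := int64(0); i < total; i++ {
            if defs[i] == maxDef {
                v := vals[vi]
                vi++
                out = append(out, &v)
            } else {
                out = append(out, nil) // null slot
            }
        }
    }
    return out, rdr.Err()
}

For repeated columns the same walk would also consult the repetition levels to recover list boundaries.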
+func (cr *Int64ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + arrow.Int64Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *Int64ColumnChunkReader) ReadBatch(batchSize int64, values []int64, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.Int64Decoder).Decode(values[start : start+len]) + }) +} + +// Int96ColumnChunkReader is the Typed Column chunk reader instance for reading +// Int96 column data. +type Int96ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *Int96ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + parquet.Int96Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *Int96ColumnChunkReader) ReadBatch(batchSize int64, values []parquet.Int96, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.Int96Decoder).Decode(values[start : start+len]) + }) +} + +// Float32ColumnChunkReader is the Typed Column chunk reader instance for reading +// Float32 column data. +type Float32ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *Float32ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + arrow.Float32Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. 
If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *Float32ColumnChunkReader) ReadBatch(batchSize int64, values []float32, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.Float32Decoder).Decode(values[start : start+len]) + }) +} + +// Float64ColumnChunkReader is the Typed Column chunk reader instance for reading +// Float64 column data. +type Float64ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *Float64ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + arrow.Float64Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *Float64ColumnChunkReader) ReadBatch(batchSize int64, values []float64, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.Float64Decoder).Decode(values[start : start+len]) + }) +} + +// BooleanColumnChunkReader is the Typed Column chunk reader instance for reading +// Boolean column data. +type BooleanColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + *(*[]bool)(unsafe.Pointer(&buf)), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. 
+// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *BooleanColumnChunkReader) ReadBatch(batchSize int64, values []bool, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.BooleanDecoder).Decode(values[start : start+len]) + }) +} + +// ByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading +// ByteArray column data. +type ByteArrayColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *ByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + parquet.ByteArrayTraits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. +// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *ByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.ByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.ByteArrayDecoder).Decode(values[start : start+len]) + }) +} + +// FixedLenByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading +// FixedLenByteArray column data. +type FixedLenByteArrayColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *FixedLenByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + parquet.FixedLenByteArrayTraits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. 
+// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *FixedLenByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.FixedLenByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.FixedLenByteArrayDecoder).Decode(values[start : start+len]) + }) +} diff --git a/go/parquet/file/column_reader_types.gen.go.tmpl b/go/parquet/file/column_reader_types.gen.go.tmpl new file mode 100644 index 0000000000000..23b7d3ed823c3 --- /dev/null +++ b/go/parquet/file/column_reader_types.gen.go.tmpl @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" +) + +{{range .In}} +// {{.Name}}ColumnChunkReader is the Typed Column chunk reader instance for reading +// {{.Name}} column data. +type {{.Name}}ColumnChunkReader struct { + columnChunkReader +} + +// Skip skips the next nvalues so that the next call to ReadBatch +// will start reading *after* the skipped values. +func (cr *{{.Name}}ColumnChunkReader) Skip(nvalues int64) (int64, error) { + return cr.columnChunkReader.skipValues(nvalues, + func(batch int64, buf []byte) (int64, error) { + vals, _, err := cr.ReadBatch(batch, + {{- if ne .Name "Boolean"}} + {{.prefix}}.{{.Name}}Traits.CastFromBytes(buf), + {{- else}} + *(*[]bool)(unsafe.Pointer(&buf)), + {{- end}} + arrow.Int16Traits.CastFromBytes(buf), + arrow.Int16Traits.CastFromBytes(buf)) + return vals, err + }) +} + +// ReadBatch reads batchSize values from the column. +// +// Returns error if values is not at least big enough to hold the number of values that will be read. +// +// defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be +// at least large enough to hold the number of values that will be read. 
+// +// total is the number of rows that were read, valuesRead is the actual number of physical values +// that were read excluding nulls +func (cr *{{.Name}}ColumnChunkReader) ReadBatch(batchSize int64, values []{{.name}}, defLvls, repLvls []int16) (total int64, valuesRead int, err error) { + return cr.readBatch(batchSize, defLvls, repLvls, func(start, len int64) (int, error) { + return cr.curDecoder.(encoding.{{.Name}}Decoder).Decode(values[start:start+len]) + }) +} +{{end}} diff --git a/go/parquet/file/file_reader.go b/go/parquet/file/file_reader.go new file mode 100644 index 0000000000000..8b95223a14d82 --- /dev/null +++ b/go/parquet/file/file_reader.go @@ -0,0 +1,336 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "bytes" + "encoding/binary" + "io" + "os" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encryption" + "github.com/apache/arrow/go/parquet/metadata" + "golang.org/x/exp/mmap" + "golang.org/x/xerrors" +) + +const ( + footerSize uint32 = 8 +) + +var ( + magicBytes = []byte("PAR1") + magicEBytes = []byte("PARE") + errInconsistentFileMetadata = xerrors.New("parquet: file is smaller than indicated metadata size") +) + +// Reader is the main interface for reading a parquet file +type Reader struct { + r parquet.ReaderAtSeeker + props *parquet.ReaderProperties + metadata *metadata.FileMetaData + footerOffset int64 + fileDecryptor encryption.FileDecryptor +} + +// an adapter for mmap'd files +type mmapAdapter struct { + *mmap.ReaderAt + + pos int64 +} + +func (m *mmapAdapter) Close() error { + return m.ReaderAt.Close() +} + +func (m *mmapAdapter) ReadAt(p []byte, off int64) (int, error) { + return m.ReaderAt.ReadAt(p, off) +} + +func (m *mmapAdapter) Read(p []byte) (n int, err error) { + n, err = m.ReaderAt.ReadAt(p, m.pos) + m.pos += int64(n) + return +} + +func (m *mmapAdapter) Seek(offset int64, whence int) (int64, error) { + newPos, offs := int64(0), offset + switch whence { + case io.SeekStart: + newPos = offs + case io.SeekCurrent: + newPos = m.pos + offs + case io.SeekEnd: + newPos = int64(m.ReaderAt.Len()) + offs + } + if newPos < 0 { + return 0, xerrors.New("negative result pos") + } + if newPos > int64(m.ReaderAt.Len()) { + return 0, xerrors.New("new position exceeds size of file") + } + m.pos = newPos + return newPos, nil +} + +type ReadOption func(*Reader) + +// WithReadProps specifies a specific reader properties instance to use, rather +// than using the default ReaderProperties. 
+func WithReadProps(props *parquet.ReaderProperties) ReadOption { + return func(r *Reader) { + r.props = props + } +} + +// WithMetadata allows providing a specific FileMetaData object rather than reading +// the file metadata from the file itself. +func WithMetadata(m *metadata.FileMetaData) ReadOption { + return func(r *Reader) { + r.metadata = m + } +} + +// OpenParquetFile will return a Reader for the given parquet file on the local file system. +// +// Optionally the file can be memory mapped for faster reading. If no read properties are provided +// then the default ReaderProperties will be used. The WithMetadata option can be used to provide +// a FileMetaData object rather than reading the file metadata from the file. +func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error) { + var source parquet.ReaderAtSeeker + + var err error + if memoryMap { + rdr, err := mmap.Open(filename) + if err != nil { + return nil, err + } + source = &mmapAdapter{rdr, 0} + } else { + source, err = os.Open(filename) + if err != nil { + return nil, err + } + } + return NewParquetReader(source, opts...) +} + +// NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r. +// This reader needs to support Read, ReadAt and Seeking. +// +// If no read properties are provided then the default ReaderProperties will be used. The WithMetadata +// option can be used to provide a FileMetaData object rather than reading the file metadata from the file. +func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error) { + var err error + f := &Reader{r: r} + for _, o := range opts { + o(f) + } + + if f.footerOffset <= 0 { + f.footerOffset, err = r.Seek(0, io.SeekEnd) + if err != nil { + return nil, xerrors.Errorf("parquet: could not retrieve footer offset: %w", err) + } + } + + if f.props == nil { + f.props = parquet.NewReaderProperties(memory.NewGoAllocator()) + } + + if f.metadata == nil { + return f, f.parseMetaData() + } + + return f, nil +} + +// Close will close the current reader, and if the underlying reader being used +// is an `io.Closer` then Close will be called on it too. +func (f *Reader) Close() error { + if r, ok := f.r.(io.Closer); ok { + return r.Close() + } + return nil +} + +// MetaData returns the underlying FileMetadata object +func (f *Reader) MetaData() *metadata.FileMetaData { return f.metadata } + +// parseMetaData handles parsing the metadata from the opened file. 
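Before the footer-parsing internals that follow, a minimal usage sketch of the reader API defined above may help; it is not part of the patch. The file name is a placeholder, the import path mirrors the one used elsewhere in this change, and error handling is reduced to a panic.

package main

import (
    "fmt"

    "github.com/apache/arrow/go/parquet/file"
)

func main() {
    // Open without memory mapping; the default ReaderProperties are used
    // because no WithReadProps option is supplied.
    rdr, err := file.OpenParquetFile("example.parquet", false)
    if err != nil {
        panic(err)
    }
    defer rdr.Close()

    // Only metadata-level accessors introduced in this file are used here.
    fmt.Println("rows:", rdr.NumRows())
    fmt.Println("row groups:", rdr.NumRowGroups())
}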
+func (f *Reader) parseMetaData() error { + if f.footerOffset <= int64(footerSize) { + return xerrors.Errorf("parquet: file too small (size=%d)", f.footerOffset) + } + + buf := make([]byte, footerSize) + // backup 8 bytes to read the footer size (first four bytes) and the magic bytes (last 4 bytes) + n, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)) + if err != nil { + return xerrors.Errorf("parquet: could not read footer: %w", err) + } + if n != len(buf) { + return xerrors.Errorf("parquet: could not read %d bytes from end of file", len(buf)) + } + + size := int64(binary.LittleEndian.Uint32(buf[:4])) + if size < 0 || size+int64(footerSize) > f.footerOffset { + return errInconsistentFileMetadata + } + + fileDecryptProps := f.props.FileDecryptProps + + switch { + case bytes.Equal(buf[4:], magicBytes): // non-encrypted metadata + buf = make([]byte, size) + if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { + return xerrors.Errorf("parquet: could not read footer: %w", err) + } + + f.metadata, err = metadata.NewFileMetaData(buf, nil) + if err != nil { + return xerrors.Errorf("parquet: could not read footer: %w", err) + } + + if !f.metadata.IsSetEncryptionAlgorithm() { + if fileDecryptProps != nil && !fileDecryptProps.PlaintextFilesAllowed() { + return xerrors.Errorf("parquet: applying decryption properties on plaintext file") + } + } else { + if err := f.parseMetaDataEncryptedFilePlaintextFooter(fileDecryptProps, buf); err != nil { + return err + } + } + case bytes.Equal(buf[4:], magicEBytes): // encrypted metadata + buf = make([]byte, size) + if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { + return xerrors.Errorf("parquet: could not read footer: %w", err) + } + + if fileDecryptProps == nil { + return xerrors.New("could not read encrypted metadata, no decryption found in reader's properties") + } + + fileCryptoMetadata, err := metadata.NewFileCryptoMetaData(buf) + if err != nil { + return err + } + algo := fileCryptoMetadata.EncryptionAlgorithm() + fileAad, err := f.handleAadPrefix(fileDecryptProps, &algo) + if err != nil { + return err + } + f.fileDecryptor = encryption.NewFileDecryptor(fileDecryptProps, fileAad, algo.Algo, string(fileCryptoMetadata.KeyMetadata()), f.props.Allocator()) + + f.metadata, err = metadata.NewFileMetaData(buf[fileCryptoMetadata.Len():], f.fileDecryptor) + if err != nil { + return xerrors.Errorf("parquet: could not read footer: %w", err) + } + default: + return xerrors.Errorf("parquet: magic bytes not found in footer. 
Either the file is corrupted or this isn't a parquet file")
+    }
+
+    return nil
+}
+
+func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) {
+    aadPrefixInProps := fileDecrypt.AadPrefix()
+    aadPrefix := []byte(aadPrefixInProps)
+    fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0
+    aadPrefixInFile := algo.Aad.AadPrefix
+
+    if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" {
+        return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not supplied in decryption props")
+    }
+
+    if fileHasAadPrefix {
+        if aadPrefixInProps != "" {
+            if aadPrefixInProps != string(aadPrefixInFile) {
+                return "", xerrors.New("AAD prefix in file and in properties but not the same")
+            }
+        }
+        aadPrefix = aadPrefixInFile
+        if fileDecrypt.Verifier != nil {
+            fileDecrypt.Verifier.Verify(string(aadPrefix))
+        }
+    } else {
+        if !algo.Aad.SupplyAadPrefix && aadPrefixInProps != "" {
+            return "", xerrors.New("AAD Prefix set in decryption properties but was not used for file encryption")
+        }
+        if fileDecrypt.Verifier != nil {
+            return "", xerrors.New("AAD Prefix Verifier is set but AAD Prefix not found in file")
+        }
+    }
+    return string(append(aadPrefix, algo.Aad.AadFileUnique...)), nil
+}
+
+func (f *Reader) parseMetaDataEncryptedFilePlaintextFooter(decryptProps *parquet.FileDecryptionProperties, data []byte) error {
+    if decryptProps != nil {
+        algo := f.metadata.EncryptionAlgorithm()
+        fileAad, err := f.handleAadPrefix(decryptProps, &algo)
+        if err != nil {
+            return err
+        }
+        f.fileDecryptor = encryption.NewFileDecryptor(decryptProps, fileAad, algo.Algo, string(f.metadata.GetFooterSigningKeyMetadata()), f.props.Allocator())
+        // set the InternalFileDecryptor in the metadata as well, as it's used
+        // for signature verification and for ColumnChunkMetaData creation.
+        f.metadata.FileDecryptor = f.fileDecryptor
+        if decryptProps.PlaintextFooterIntegrity() {
+            if len(data)-f.metadata.Size() != encryption.GcmTagLength+encryption.NonceLength {
+                return xerrors.New("failed reading metadata for encryption signature")
+            }
+
+            if !f.metadata.VerifySignature(data[f.metadata.Size():]) {
+                return xerrors.New("parquet crypto signature verification failed")
+            }
+        }
+    }
+    return nil
+}
+
+// WriterVersion returns the Application Version that was written in the file
+// metadata
+func (f *Reader) WriterVersion() *metadata.AppVersion {
+    return f.metadata.WriterVersion()
+}
+
+// NumRows returns the total number of rows in this parquet file.
+func (f *Reader) NumRows() int64 {
+    return f.metadata.GetNumRows()
+}
+
+// NumRowGroups returns the total number of row groups in this file.
+func (f *Reader) NumRowGroups() int {
+    return len(f.metadata.GetRowGroups())
+}
+
+// RowGroup returns a reader for the desired (0-based) row group
+func (f *Reader) RowGroup(i int) *RowGroupReader {
+    rg := f.metadata.RowGroups[i]
+
+    return &RowGroupReader{
+        fileMetadata:  f.metadata,
+        rgMetadata:    metadata.NewRowGroupMetaData(rg, f.metadata.Schema, f.WriterVersion(), f.fileDecryptor),
+        props:         f.props,
+        r:             f.r,
+        sourceSz:      f.footerOffset,
+        fileDecryptor: f.fileDecryptor,
+    }
+}
diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go
new file mode 100644
index 0000000000000..6dfb1fa6bc736
--- /dev/null
+++ b/go/parquet/file/file_reader_test.go
@@ -0,0 +1,304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file_test + +import ( + "bytes" + "encoding/binary" + "io" + "math/rand" + "testing" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet/compress" + "github.com/apache/arrow/go/parquet/file" + "github.com/apache/arrow/go/parquet/internal/encoding" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/thrift" + "github.com/apache/arrow/go/parquet/metadata" + libthrift "github.com/apache/thrift/lib/go/thrift" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" +) + +func getDummyStats(statSize int, fillAll bool) *format.Statistics { + statBytes := make([]byte, statSize) + memory.Set(statBytes, 1) + + ret := format.NewStatistics() + ret.Max = statBytes + if fillAll { + ret.Min = statBytes + ret.NullCount = libthrift.Int64Ptr(42) + ret.DistinctCount = libthrift.Int64Ptr(1) + } + return ret +} + +func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) { + if stats.IsSetMax() { + assert.Equal(t, stats.Max, actual.Max) + } + if stats.IsSetMin() { + assert.Equal(t, stats.Min, actual.Min) + } + if stats.IsSetNullCount() { + assert.Equal(t, stats.GetNullCount(), actual.NullCount) + } + if stats.IsSetDistinctCount() { + assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount) + } +} + +type PageSerdeSuite struct { + suite.Suite + + sink *encoding.BufferWriter + buffer *memory.Buffer + + pageHdr format.PageHeader + dataPageHdr format.DataPageHeader + dataPageHdrV2 format.DataPageHeaderV2 + + pageReader file.PageReader +} + +func TestFileDeserializing(t *testing.T) { + t.Parallel() + suite.Run(t, new(PageSerdeSuite)) +} + +func (p *PageSerdeSuite) ResetStream() { + p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator) +} + +func (p *PageSerdeSuite) EndStream() { + p.buffer = p.sink.Finish() +} + +func (p *PageSerdeSuite) SetupTest() { + p.dataPageHdr.Encoding = format.Encoding_PLAIN + p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE + p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE + + p.ResetStream() +} + +func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) { + p.EndStream() + + p.pageReader, _ = file.NewPageReader(bytes.NewReader(p.buffer.Bytes()), nrows, codec, memory.DefaultAllocator, nil) +} + +func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) { + // simplifying writing serialized data page headers which may or may + // not have meaningful data associated with them + + p.pageHdr.DataPageHeader = &p.dataPageHdr + p.pageHdr.UncompressedPageSize = uncompressed + p.pageHdr.CompressedPageSize = compressed + p.pageHdr.Type = format.PageType_DATA_PAGE + + serializer := thrift.NewThriftSerializer() + p.NotPanics(func() { + 
serializer.Serialize(&p.pageHdr, p.sink, nil) + }) +} + +func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) { + p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2 + p.pageHdr.UncompressedPageSize = uncompressed + p.pageHdr.CompressedPageSize = compressed + p.pageHdr.Type = format.PageType_DATA_PAGE_V2 + + serializer := thrift.NewThriftSerializer() + p.NotPanics(func() { + serializer.Serialize(&p.pageHdr, p.sink, nil) + }) +} + +func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) { + p.Equal(format.PageType_DATA_PAGE, page.Type()) + + p.IsType(&file.DataPageV1{}, page) + p.Equal(expected.NumValues, page.NumValues()) + p.Equal(expected.Encoding, page.Encoding()) + p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding()) + p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding()) + checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics()) +} + +func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) { + p.Equal(format.PageType_DATA_PAGE_V2, page.Type()) + + p.IsType(&file.DataPageV2{}, page) + p.Equal(expected.NumValues, page.NumValues()) + p.Equal(expected.Encoding, page.Encoding()) + p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls()) + p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen()) + p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen()) + p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed()) + checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics()) +} + +func (p *PageSerdeSuite) TestDataPageV1() { + const ( + statsSize = 512 + nrows = 4444 + ) + p.dataPageHdr.Statistics = getDummyStats(statsSize, true) + p.dataPageHdr.NumValues = nrows + + p.WriteDataPageHeader(1024, 0, 0) + p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) + p.True(p.pageReader.Next()) + currentPage := p.pageReader.Page() + p.CheckDataPageHeader(p.dataPageHdr, currentPage) +} + +func (p *PageSerdeSuite) TestDataPageV2() { + const ( + statsSize = 512 + nrows = 4444 + ) + p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true) + p.dataPageHdrV2.NumValues = nrows + p.WriteDataPageHeaderV2(1024, 0, 0) + p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) + p.True(p.pageReader.Next()) + p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page()) +} + +func (p *PageSerdeSuite) TestLargePageHeaders() { + const ( + statsSize = 256 * 1024 // 256KB + nrows = 4141 + maxHeaderSize = 512 * 1024 // 512KB + ) + + p.dataPageHdr.Statistics = getDummyStats(statsSize, false) + p.dataPageHdr.NumValues = nrows + p.WriteDataPageHeader(maxHeaderSize, 0, 0) + pos, err := p.sink.Seek(0, io.SeekCurrent) + p.NoError(err) + p.GreaterOrEqual(maxHeaderSize, int(pos)) + p.LessOrEqual(statsSize, int(pos)) + p.GreaterOrEqual(16*1024*1024, int(pos)) + + p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) + p.True(p.pageReader.Next()) + p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page()) +} + +func (p *PageSerdeSuite) TestFailLargePageHeaders() { + const ( + statsSize = 256 * 1024 // 256KB + nrows = 1337 // dummy value + maxHeaderSize = 512 * 1024 // 512 KB + smallerMaxSize = 128 * 1024 // 128KB + ) + p.dataPageHdr.Statistics = getDummyStats(statsSize, false) + p.WriteDataPageHeader(maxHeaderSize, 0, 0) + pos, err := p.sink.Seek(0, 
io.SeekCurrent) + p.NoError(err) + p.GreaterOrEqual(maxHeaderSize, int(pos)) + + p.LessOrEqual(smallerMaxSize, int(pos)) + p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) + p.pageReader.SetMaxPageHeaderSize(smallerMaxSize) + p.NotPanics(func() { p.False(p.pageReader.Next()) }) + p.Error(p.pageReader.Err()) +} + +func (p *PageSerdeSuite) TestCompression() { + codecs := []compress.Compression{ + compress.Codecs.Snappy, + compress.Codecs.Brotli, + compress.Codecs.Gzip, + // compress.Codecs.Lz4, // not yet implemented + compress.Codecs.Zstd, + } + + const ( + nrows = 32 // dummy value + npages = 10 + ) + p.dataPageHdr.NumValues = nrows + + fauxData := make([][]byte, npages) + for idx := range fauxData { + // each page is larger + fauxData[idx] = make([]byte, (idx+1)*64) + rand.Read(fauxData[idx]) + } + for _, c := range codecs { + p.Run(c.String(), func() { + codec, _ := compress.GetCodec(c) + for _, data := range fauxData { + maxCompressed := codec.CompressBound(int64(len(data))) + buffer := make([]byte, maxCompressed) + buffer = codec.Encode(buffer, data) + p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer))) + _, err := p.sink.Write(buffer) + p.NoError(err) + } + + p.InitSerializedPageReader(nrows*npages, c) + + for _, data := range fauxData { + p.True(p.pageReader.Next()) + page := p.pageReader.Page() + p.IsType(&file.DataPageV1{}, page) + p.Equal(data, page.Data()) + } + p.ResetStream() + }) + } +} + +func TestInvalidHeaders(t *testing.T) { + badHeader := []byte("PAR2") + _, err := file.NewParquetReader(bytes.NewReader(badHeader)) + assert.Error(t, err) +} + +func TestInvalidFooter(t *testing.T) { + // file is smaller than FOOTER_SIZE + badFile := []byte("PAR1PAR") + _, err := file.NewParquetReader(bytes.NewReader(badFile)) + assert.Error(t, err) + + // Magic Number Incorrect + badFile2 := []byte("PAR1PAR2") + _, err = file.NewParquetReader(bytes.NewReader(badFile2)) + assert.Error(t, err) +} + +func TestIncompleteMetadata(t *testing.T) { + sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) + magic := []byte("PAR1") + + sink.Write(magic) + sink.Write(make([]byte, 10)) + const metadataLen = 24 + binary.Write(sink, binary.LittleEndian, uint32(metadataLen)) + sink.Write(magic) + buf := sink.Finish() + defer buf.Release() + _, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) + assert.Error(t, err) +} diff --git a/go/parquet/file/level_conversion.go b/go/parquet/file/level_conversion.go new file mode 100644 index 0000000000000..6c56c13933e08 --- /dev/null +++ b/go/parquet/file/level_conversion.go @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package file
+
+import (
+    "math"
+    "math/bits"
+    "unsafe"
+
+    "github.com/apache/arrow/go/parquet"
+    "github.com/apache/arrow/go/parquet/internal/bmi"
+    "github.com/apache/arrow/go/parquet/internal/utils"
+    "github.com/apache/arrow/go/parquet/schema"
+    "golang.org/x/xerrors"
+)
+
+type LevelInfo struct {
+    // How many slots an undefined but present (i.e. null) element in
+    // parquet consumes when decoding to Arrow.
+    // "Slot" is used in the same context as the Arrow specification
+    // (i.e. a value holder).
+    // This is only ever >1 for descendants of FixedSizeList.
+    NullSlotUsage int32
+    // The definition level at which the value for the field
+    // is considered not null (definition levels greater than
+    // or equal to this value indicate a not-null
+    // value for the field). For list fields definition levels
+    // greater than or equal to this field indicate a present,
+    // possibly null, child value.
+    DefLevel int16
+    // The repetition level corresponding to this element
+    // or the closest repeated ancestor. Any repetition
+    // level less than this indicates either a new list OR
+    // an empty list (which is determined in conjunction
+    // with definition levels).
+    RepLevel int16
+    // The definition level indicating the level at which the closest
+    // repeated ancestor is not empty. This is used to discriminate
+    // between a value less than |def_level| being null or excluded entirely.
+    // For instance if we have an Arrow schema like:
+    // list(struct(f0: int)). Then there are the following
+    // definition levels:
+    // 0 = null list
+    // 1 = present but empty list.
+    // 2 = a null value in the list
+    // 3 = a non null struct but null integer.
+    // 4 = a present integer.
+    // When reconstructing, the struct and integer arrays'
+    // repeated_ancestor_def_level would be 2. Any
+    // def_level < 2 indicates that there isn't a corresponding
+    // child value in the list.
+    // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
+    // has the def levels [0, 1, 2, 3, 4]. The actual
+    // struct array is only of length 3: [not-set, set, set] and
+    // the int array is also of length 3: [N/A, null, 1].
+    RepeatedAncestorDefLevel int16
+}
+
+func newDefaultLevelInfo() *LevelInfo {
+    return &LevelInfo{NullSlotUsage: 1}
+}
+
+func (l *LevelInfo) Equal(rhs *LevelInfo) bool {
+    return l.NullSlotUsage == rhs.NullSlotUsage &&
+        l.DefLevel == rhs.DefLevel &&
+        l.RepLevel == rhs.RepLevel &&
+        l.RepeatedAncestorDefLevel == rhs.RepeatedAncestorDefLevel
+}
+
+func (l *LevelInfo) HasNullableValues() bool {
+    return l.RepeatedAncestorDefLevel < l.DefLevel
+}
+
+func (l *LevelInfo) IncrementOptional() {
+    l.DefLevel++
+}
+
+func (l *LevelInfo) IncrementRepeated() int16 {
+    lastRepAncestor := l.RepeatedAncestorDefLevel
+    // Repeated fields add both a repetition and definition level. This is used
+    // to distinguish between an empty list and a list with an item in it.
+    l.RepLevel++
+    l.DefLevel++
+
+    // For levels >= repeated_ancestor_def_level it indicates the list was
+    // non-null and had at least one element. This is important
+    // for later decoding because we need to add a slot for these
+    // values. For levels < current_def_level no slots are added
+    // to arrays. 
+    l.RepeatedAncestorDefLevel = l.DefLevel
+    return lastRepAncestor
+}
+
+func (l *LevelInfo) Increment(n schema.Node) {
+    switch n.RepetitionType() {
+    case parquet.Repetitions.Repeated:
+        l.IncrementRepeated()
+    case parquet.Repetitions.Optional:
+        l.IncrementOptional()
+    }
+}
+
+// Input/Output structure for reconstructed validity bitmaps.
+type ValidityBitmapInputOutput struct {
+    // Input only.
+    // The maximum number of values_read expected (actual
+    // values read must be less than or equal to this value).
+    // If this number is exceeded methods will throw a
+    // ParquetException. Exceeding this limit indicates
+    // either a corrupt or incorrectly written file.
+    ReadUpperBound int64
+    // Output only. The number of values encountered and added to the
+    // output (this is logically the element count of the resulting
+    // Arrow array).
+    Read int64
+    // Input/Output. The number of nulls encountered.
+    NullCount int64
+    // Output only. The validity bitmap to populate. May be null only
+    // for DefRepLevelsToListInfo (if all that is needed is list offsets).
+    ValidBits []byte
+    // Input only, offset into valid_bits to start at.
+    ValidBitsOffset int64
+}
+
+const extractBitsSize int64 = 8 * int64(unsafe.Sizeof(uint64(0)))
+
+// create a bitmap out of the definition levels and return the number of non-null values
+func defLevelsBatchToBitmap(defLevels []int16, remainingUpperBound int64, info LevelInfo, wr utils.BitmapWriter, hasRepeatedParent bool) uint64 {
+    definedBitmap := bmi.GreaterThanBitmap(defLevels, info.DefLevel-1)
+
+    if hasRepeatedParent {
+        // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
+        // repeated_ancestor_def_level
+        presentBitmap := bmi.GreaterThanBitmap(defLevels, info.RepeatedAncestorDefLevel-1)
+        selectedBits := bmi.ExtractBits(definedBitmap, presentBitmap)
+        selectedCount := int64(bits.OnesCount64(presentBitmap))
+        if selectedCount > remainingUpperBound {
+            panic("values read exceeded upper bound")
+        }
+        wr.AppendWord(selectedBits, selectedCount)
+        return uint64(bits.OnesCount64(selectedBits))
+    }
+
+    if int64(len(defLevels)) > remainingUpperBound {
+        panic("values read exceed upper bound")
+    }
+
+    wr.AppendWord(definedBitmap, int64(len(defLevels)))
+    return uint64(bits.OnesCount64(definedBitmap))
+}
+
+// create a bitmap out of the definition levels
+func defLevelsToBitmapInternal(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, hasRepeatedParent bool) {
+    wr := utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, int64(len(defLevels)))
+    defer wr.Finish()
+    setCount := defLevelsBatchToBitmap(defLevels, out.ReadUpperBound, info, wr, hasRepeatedParent)
+    out.Read = int64(wr.Pos())
+    out.NullCount += out.Read - int64(setCount)
+}
+
+// DefLevelsToBitmap creates a validity bitmap out of the passed in definition levels and info object.
+func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) {
+    hasRepeatedParent := false
+    if info.RepLevel > 0 {
+        hasRepeatedParent = true
+    }
+    defLevelsToBitmapInternal(defLevels, info, out, hasRepeatedParent)
+}
+
+// DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap
+// and properly handle nested lists and update the offsets for them. 
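As a concrete illustration of DefLevelsToBitmap above, here is a hedged, test-style sketch for a flat optional column (DefLevel 1, no repeated parent); it is not one of the patch's own tests, and the expected numbers follow from defLevelsBatchToBitmap. The DefRepLevelsToListInfo implementation continues right after.

package file

import "testing"

func TestDefLevelsToBitmapFlatOptionalSketch(t *testing.T) {
    defLevels := []int16{1, 0, 1, 1} // the second value is null
    out := ValidityBitmapInputOutput{
        ReadUpperBound: int64(len(defLevels)),
        ValidBits:      make([]byte, 1),
    }
    DefLevelsToBitmap(defLevels, LevelInfo{NullSlotUsage: 1, DefLevel: 1}, &out)

    // Bits 0, 2 and 3 are set: 0b00001101.
    if out.Read != 4 || out.NullCount != 1 || out.ValidBits[0] != 0x0d {
        t.Fatalf("unexpected result: read=%d nulls=%d bits=%08b",
            out.Read, out.NullCount, out.ValidBits[0])
    }
}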
+func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error { + var wr utils.BitmapWriter + if out.ValidBits != nil { + wr = utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, out.ReadUpperBound) + defer wr.Finish() + } + offsetPos := 0 + for idx := range defLevels { + // skip items that belong to empty or null ancestor lists and further nested lists + if defLevels[idx] < info.RepeatedAncestorDefLevel || repLevels[idx] > info.RepLevel { + continue + } + + if repLevels[idx] == info.RepLevel { + // continuation of an existing list. + // offsets can be null for structs with repeated children + if offsetPos < len(offsets) { + if offsets[offsetPos] == math.MaxInt32 { + return xerrors.New("list index overflow") + } + offsets[offsetPos]++ + } + } else { + if (wr != nil && int64(wr.Pos()) >= out.ReadUpperBound) || (offsetPos >= int(out.ReadUpperBound)) { + return xerrors.Errorf("definition levels exceeded upper bound: %d", out.ReadUpperBound) + } + + // current_rep < list rep_level i.e. start of a list (ancestor empty lists + // are filtered out above) + // offsets can be null for structs with repeated children + if offsetPos+1 < len(offsets) { + offsetPos++ + // use cumulative offsets because variable size lists are more common + // than fixed size lists so it should be cheaper to make these + // cumulative and subtract when validating fixed size lists + offsets[offsetPos] = offsets[offsetPos-1] + if defLevels[idx] >= info.DefLevel { + if offsets[offsetPos] == math.MaxInt32 { + return xerrors.New("list index overflow") + } + offsets[offsetPos]++ + } + } + + if wr != nil { + // the level info def level for lists reflects element present level + // the prior level distinguishes between empty lists + if defLevels[idx] >= info.DefLevel-1 { + wr.Set() + } else { + out.NullCount++ + wr.Clear() + } + wr.Next() + } + } + } + + if len(offsets) > 0 { + out.Read = int64(offsetPos) + } else if wr != nil { + out.Read = int64(wr.Pos()) + } + + if out.NullCount > 0 && info.NullSlotUsage > 1 { + return xerrors.New("null values with null_slot_usage > 1 not supported.") + } + return nil +} + +// DefRepLevelsToBitmap constructs a full validitybitmap out of the definition and repetition levels +// properly handling nested lists and parents. +func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error { + info.RepLevel++ + info.DefLevel++ + return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil) +} diff --git a/go/parquet/file/level_conversion_test.go b/go/parquet/file/level_conversion_test.go new file mode 100644 index 0000000000000..08d2fe311f88a --- /dev/null +++ b/go/parquet/file/level_conversion_test.go @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "strings" + "testing" + + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/parquet/internal/bmi" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/stretchr/testify/assert" +) + +func bitmapToString(bitmap []byte, bitCount int64) string { + var bld strings.Builder + bld.Grow(int(bitCount)) + for i := 0; i < int(bitCount); i++ { + if bitutil.BitIsSet(bitmap, i) { + bld.WriteByte('1') + } else { + bld.WriteByte('0') + } + } + return bld.String() +} + +func TestDefLevelsToBitmap(t *testing.T) { + defLevels := []int16{3, 3, 3, 2, 3, 3, 3, 3, 3} + validBits := []byte{2, 0} + + var info LevelInfo + info.DefLevel = 3 + info.RepLevel = 1 + + var io ValidityBitmapInputOutput + io.ReadUpperBound = int64(len(defLevels)) + io.Read = -1 + io.ValidBits = validBits + + DefLevelsToBitmap(defLevels, info, &io) + assert.Equal(t, int64(9), io.Read) + assert.Equal(t, int64(1), io.NullCount) + + // call again with 0 definition levels make sure that valid bits is unmodified + curByte := validBits[1] + io.NullCount = 0 + DefLevelsToBitmap(defLevels[:0], info, &io) + + assert.Zero(t, io.Read) + assert.Zero(t, io.NullCount) + assert.Equal(t, curByte, validBits[1]) +} + +func TestDefLevelstToBitmapPowerOf2(t *testing.T) { + defLevels := []int16{3, 3, 3, 2, 3, 3, 3, 3} + validBits := []byte{1, 0} + + var ( + info LevelInfo + io ValidityBitmapInputOutput + ) + + info.RepLevel = 1 + info.DefLevel = 3 + io.Read = -1 + io.ReadUpperBound = int64(len(defLevels)) + io.ValidBits = validBits + + DefLevelsToBitmap(defLevels[4:8], info, &io) + assert.Equal(t, int64(4), io.Read) + assert.Zero(t, io.NullCount) +} + +func TestGreaterThanBitmapGeneratesExpectedBitmasks(t *testing.T) { + defLevels := []int16{ + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7} + + tests := []struct { + name string + num int + rhs int16 + expected uint64 + }{ + {"no levels", 0, 0, 0}, + {"64 and 8", 64, 8, 0}, + {"64 and -1", 64, -1, 0xFFFFFFFFFFFFFFFF}, + // should be zero padded + {"zero pad 47, -1", 47, -1, 0x7FFFFFFFFFFF}, + {"zero pad 64 and 6", 64, 6, 0x8080808080808080}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, bmi.GreaterThanBitmap(defLevels[:tt.num], tt.rhs)) + }) + } +} + +func TestWithRepetitionlevelFiltersOutEmptyListValues(t *testing.T) { + validityBitmap := make([]byte, 8) + io := ValidityBitmapInputOutput{ + ReadUpperBound: 64, + Read: 1, + NullCount: 5, + ValidBits: validityBitmap, + ValidBitsOffset: 1, + } + + info := LevelInfo{ + RepeatedAncestorDefLevel: 1, + DefLevel: 2, + RepLevel: 1, + } + + defLevels := []int16{0, 0, 0, 2, 2, 1, 0, 2} + DefLevelsToBitmap(defLevels, info, &io) + + assert.Equal(t, bitmapToString(validityBitmap, 8), "01101000") + for _, x := range validityBitmap[1:] { + assert.Zero(t, x) + } + assert.EqualValues(t, 6, io.NullCount) + assert.EqualValues(t, 4, io.Read) +} + +type MultiLevelTestData struct { + defLevels []int16 + repLevels []int16 +} + +func TriplNestedList() MultiLevelTestData { + // Triply nested list values borrow from write_path + // [null, [[1, null, 3], []], []], + // [[[]], [[], [1, 2]], null, [[3]]], + // null, + // [] + return MultiLevelTestData{ + defLevels: []int16{2, 7, 6, 7, 5, 3, // first row + 5, 5, 7, 
7, 2, 7, // second row + 0, // third row + 1}, + repLevels: []int16{0, 1, 3, 3, 2, 1, // first row + 0, 1, 2, 3, 1, 1, // second row + 0, 0}, + } +} + +func TestActualCase(t *testing.T) { + out := make([]byte, 512) + defs := make([]int16, 64) + for i := range defs { + defs[i] = 3 + } + + defs[0] = 0 + defs[25] = 0 + defs[33] = 0 + defs[49] = 0 + defs[58] = 0 + defs[59] = 0 + defs[60] = 0 + defs[61] = 0 + + remaining := int64(4096) + info := LevelInfo{ + NullSlotUsage: 0, + DefLevel: 3, + RepLevel: 1, + RepeatedAncestorDefLevel: 2, + } + + wr := utils.NewFirstTimeBitmapWriter(out, 0, 4096) + v := defLevelsBatchToBitmap(defs, remaining, info, wr, true) + assert.EqualValues(t, 56, v) + assert.Equal(t, []byte{255, 255, 255, 255}, out[:4]) +} diff --git a/go/parquet/file/page_reader.go b/go/parquet/file/page_reader.go new file mode 100644 index 0000000000000..251499af21ce7 --- /dev/null +++ b/go/parquet/file/page_reader.go @@ -0,0 +1,620 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "bytes" + "io" + "sync" + + "github.com/JohnCGriffin/overflow" + "github.com/apache/arrow/go/arrow/ipc" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/compress" + "github.com/apache/arrow/go/parquet/internal/debug" + "github.com/apache/arrow/go/parquet/internal/encryption" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/thrift" + "github.com/apache/arrow/go/parquet/metadata" + "golang.org/x/xerrors" +) + +// PageReader is the interface used by the columnreader in order to read +// and handle DataPages and loop through them. +type PageReader interface { + // Set the maximum Page header size allowed to be read + SetMaxPageHeaderSize(int) + // Return the current page, or nil if there are no more + Page() Page + // Fetch the next page, returns false if there are no more pages + Next() bool + // if Next returns false, Err will return the error encountered or + // nil if there was no error and you just hit the end of the page + Err() error + // Reset allows reusing a page reader + Reset(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, ctx *CryptoContext) +} + +// Page is an interface for handling DataPages or Dictionary Pages +type Page interface { + // Returns which kind of page this is + Type() format.PageType + // Get the raw bytes of this page + Data() []byte + // return the encoding used for this page, Plain/RLE, etc. 
+ Encoding() format.Encoding + // get the number of values in this page + NumValues() int32 + // release this page object back into the page pool for re-use + Release() +} + +type page struct { + buf *memory.Buffer + typ format.PageType + + nvals int32 + encoding format.Encoding +} + +func (p *page) Type() format.PageType { return p.typ } +func (p *page) Data() []byte { return p.buf.Bytes() } +func (p *page) NumValues() int32 { return p.nvals } +func (p *page) Encoding() format.Encoding { return p.encoding } + +// DataPage is the base interface for both DataPageV1 and DataPageV2 of the +// parquet spec. +type DataPage interface { + Page + UncompressedSize() int64 + Statistics() metadata.EncodedStatistics +} + +// Create some pools to use for reusing the data page objects themselves so that +// we can avoid tight loops that are creating and destroying tons of individual +// objects. This combined with a Release function on the pages themselves +// which will put them back into the pool yields significant memory reduction +// and performance benefits + +var dataPageV1Pool = sync.Pool{ + New: func() interface{} { return (*DataPageV1)(nil) }, +} + +var dataPageV2Pool = sync.Pool{ + New: func() interface{} { return (*DataPageV2)(nil) }, +} + +var dictPagePool = sync.Pool{ + New: func() interface{} { return (*DictionaryPage)(nil) }, +} + +// DataPageV1 represents a DataPage version 1 from the parquet.thrift file +type DataPageV1 struct { + page + + defLvlEncoding format.Encoding + repLvlEncoding format.Encoding + uncompressedSize int64 + statistics metadata.EncodedStatistics +} + +// NewDataPageV1 returns a V1 data page with the given buffer as its data and the specified encoding information +// +// Will utilize objects that have been released back into the data page pool and +// re-use them if available as opposed to creating new objects. Calling Release on the +// data page object will release it back to the pool for re-use. +func NewDataPageV1(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int64) *DataPageV1 { + dp := dataPageV1Pool.Get().(*DataPageV1) + if dp == nil { + return &DataPageV1{ + page: page{buf: buffer, typ: format.PageType_DATA_PAGE, nvals: num, encoding: format.Encoding(encoding)}, + defLvlEncoding: format.Encoding(defEncoding), + repLvlEncoding: format.Encoding(repEncoding), + uncompressedSize: uncompressedSize, + } + } + + dp.buf, dp.nvals = buffer, num + dp.encoding = format.Encoding(encoding) + dp.defLvlEncoding, dp.repLvlEncoding = format.Encoding(defEncoding), format.Encoding(repEncoding) + dp.statistics.HasMax, dp.statistics.HasMin = false, false + dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false + dp.uncompressedSize = uncompressedSize + return dp +} + +// NewDataPageV1WithStats is the same as NewDataPageV1, but also allows adding the stat info into the created page +func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int64, stats metadata.EncodedStatistics) *DataPageV1 { + ret := NewDataPageV1(buffer, num, encoding, defEncoding, repEncoding, uncompressedSize) + ret.statistics = stats + return ret +} + +// Release this page back into the DataPage object pool so that it can be reused. +// +// After calling this function, the object should not be utilized anymore, otherwise +// conflicts can arise. 
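To make the pooling contract above concrete, a hedged sketch of the intended page lifecycle follows before the Release implementation; buildPlainDataPage is a hypothetical helper, and the parquet.Encodings constants are assumed from the rest of the module rather than shown in this patch.

package file

import (
    "github.com/apache/arrow/go/arrow/memory"
    "github.com/apache/arrow/go/parquet"
)

// buildPlainDataPage constructs a V1 data page around raw bytes, consumes it,
// and hands the object back to the pool via Release.
func buildPlainDataPage(raw []byte, numValues int32) {
    buf := memory.NewBufferBytes(raw)
    pg := NewDataPageV1(buf, numValues,
        parquet.Encodings.Plain, // value encoding (assumed constant name)
        parquet.Encodings.RLE,   // definition level encoding
        parquet.Encodings.RLE,   // repetition level encoding
        int64(len(raw)))
    _ = pg.Data() // ... decode the page contents here ...
    pg.Release()  // returns the object to dataPageV1Pool; pg must not be used afterwards
}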
+func (d *DataPageV1) Release() { + d.buf.Release() + d.buf = nil + dataPageV1Pool.Put(d) +} + +// UncompressedSize returns the size of the data in this data page when uncompressed +func (d *DataPageV1) UncompressedSize() int64 { return d.uncompressedSize } + +// Statistics returns the encoded statistics on this data page +func (d *DataPageV1) Statistics() metadata.EncodedStatistics { return d.statistics } + +// DefinitionLevelEncoding returns the encoding utilized for the Definition Levels +func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding { + return parquet.Encoding(d.defLvlEncoding) +} + +// RepetitionLevelEncoding returns the encoding utilized for the Repetition Levels +func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding { + return parquet.Encoding(d.repLvlEncoding) +} + +// DataPageV2 is the representation of the V2 data page from the parquet.thrift spec +type DataPageV2 struct { + page + + nulls int32 + nrows int32 + defLvlByteLen int32 + repLvlByteLen int32 + compressed bool + uncompressedSize int64 + statistics metadata.EncodedStatistics +} + +// NewDataPageV2 constructs a new V2 data page with the provided information and a buffer of the raw data. +func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen int32, uncompressed int64, isCompressed bool) *DataPageV2 { + dp := dataPageV2Pool.Get().(*DataPageV2) + if dp == nil { + return &DataPageV2{ + page: page{buf: buffer, typ: format.PageType_DATA_PAGE_V2, nvals: numValues, encoding: format.Encoding(encoding)}, + nulls: numNulls, + nrows: numRows, + defLvlByteLen: defLvlsByteLen, + repLvlByteLen: repLvlsByteLen, + compressed: isCompressed, + uncompressedSize: uncompressed, + } + } + + dp.buf, dp.nvals = buffer, numValues + dp.encoding = format.Encoding(encoding) + dp.nulls, dp.nrows = numNulls, numRows + dp.defLvlByteLen, dp.repLvlByteLen = defLvlsByteLen, repLvlsByteLen + dp.compressed, dp.uncompressedSize = isCompressed, uncompressed + dp.statistics.HasMax, dp.statistics.HasMin = false, false + dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false + return dp +} + +// NewDataPageV2WithStats is the same as NewDataPageV2 but allows providing the encoded stats with the page. +func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen int32, uncompressed int64, isCompressed bool, stats metadata.EncodedStatistics) *DataPageV2 { + ret := NewDataPageV2(buffer, numValues, numNulls, numRows, encoding, defLvlsByteLen, repLvlsByteLen, uncompressed, isCompressed) + ret.statistics = stats + return ret +} + +// Release this page back into the DataPage object pool so that it can be reused. +// +// After calling this function, the object should not be utilized anymore, otherwise +// conflicts can arise. +func (d *DataPageV2) Release() { + d.buf.Release() + d.buf = nil + dataPageV2Pool.Put(d) +} + +// UncompressedSize is the size of the raw page when uncompressed. If `IsCompressed` is true, then +// the raw data in the buffer is expected to be compressed. 
+func (d *DataPageV2) UncompressedSize() int64 { return d.uncompressedSize } + +// Statistics are the encoded statistics in the data page +func (d *DataPageV2) Statistics() metadata.EncodedStatistics { return d.statistics } + +// NumNulls is the reported number of nulls in this datapage +func (d *DataPageV2) NumNulls() int32 { return d.nulls } + +// DefinitionLevelByteLen is the number of bytes in the buffer that are used to represent the definition levels +func (d *DataPageV2) DefinitionLevelByteLen() int32 { return d.defLvlByteLen } + +// RepetitionLevelByteLen is the number of bytes in the buffer which are used to represent the repetition Levels +func (d *DataPageV2) RepetitionLevelByteLen() int32 { return d.repLvlByteLen } + +// IsCompressed returns true if the data of this page is compressed +func (d *DataPageV2) IsCompressed() bool { return d.compressed } + +// DictionaryPage represents the a page of data that uses dictionary encoding +type DictionaryPage struct { + page + + sorted bool +} + +// NewDictionaryPage constructs a new dictionary page with the provided data buffer and number of values. +func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage { + dp := dictPagePool.Get().(*DictionaryPage) + if dp == nil { + return &DictionaryPage{ + page: page{ + buf: buffer, + typ: format.PageType_DICTIONARY_PAGE, + nvals: nvals, + encoding: format.Encoding(encoding), + }, + } + } + + dp.buf = buffer + dp.nvals = nvals + dp.encoding = format.Encoding(encoding) + dp.sorted = false + return dp +} + +// Release this page back into the DataPage object pool so that it can be reused. +// +// After calling this function, the object should not be utilized anymore, otherwise +// conflicts can arise. +func (d *DictionaryPage) Release() { + d.buf.Release() + d.buf = nil + dictPagePool.Put(d) +} + +// IsSorted returns whether the dictionary itself is sorted +func (d *DictionaryPage) IsSorted() bool { return d.sorted } + +type serializedPageReader struct { + r ipc.ReadAtSeeker + nrows int64 + rowsSeen int64 + mem memory.Allocator + codec compress.Codec + + curPageHdr *format.PageHeader + buf *memory.Buffer + pageOrd int16 + maxPageHeaderSize int + + curPage Page + cryptoCtx CryptoContext + dataPageAad string + dataPageHeaderAad string + + decompressBuffer bytes.Buffer + err error +} + +// NewPageReader returns a page reader for the data which can be read from the provided reader and compression. 
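Before the NewPageReader implementation below, a hedged sketch of how a PageReader is meant to be driven; iteratePages is a hypothetical helper, and only identifiers defined in this patch are referenced.

package file

import (
    "fmt"

    "github.com/apache/arrow/go/arrow/memory"
    "github.com/apache/arrow/go/parquet"
    "github.com/apache/arrow/go/parquet/compress"
    format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet"
)

// iteratePages walks every page in an uncompressed column chunk and reports
// the page type and value count.
func iteratePages(rdr parquet.ReaderAtSeeker, nrows int64) error {
    pr, err := NewPageReader(rdr, nrows, compress.Codecs.Uncompressed, memory.DefaultAllocator, nil)
    if err != nil {
        return err
    }
    for pr.Next() {
        pg := pr.Page()
        if pg.Type() == format.PageType_DICTIONARY_PAGE {
            fmt.Println("dictionary page:", pg.NumValues(), "values")
            continue
        }
        fmt.Println("data page:", pg.NumValues(), "values")
        // Pages are pooled; the next call to Next releases pg, so do not retain it.
    }
    return pr.Err()
}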
+func NewPageReader(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, mem memory.Allocator, ctx *CryptoContext) (PageReader, error) { + if mem == nil { + mem = memory.NewGoAllocator() + } + + codec, err := compress.GetCodec(compressType) + if err != nil { + return nil, err + } + + rdr := &serializedPageReader{ + r: r, + maxPageHeaderSize: defaultMaxPageHeaderSize, + nrows: nrows, + mem: mem, + codec: codec, + buf: memory.NewResizableBuffer(mem), + } + rdr.decompressBuffer.Grow(defaultPageHeaderSize) + if ctx != nil { + rdr.cryptoCtx = *ctx + rdr.initDecryption() + } + return rdr, nil +} + +func (p *serializedPageReader) Reset(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, ctx *CryptoContext) { + p.rowsSeen, p.pageOrd = 0, 0 + p.curPageHdr, p.curPage, p.err = nil, nil, nil + p.r, p.nrows = r, nrows + + p.codec, p.err = compress.GetCodec(compressType) + if p.err != nil { + return + } + p.buf.ResizeNoShrink(0) + p.decompressBuffer.Reset() + if ctx != nil { + p.cryptoCtx = *ctx + p.initDecryption() + } else { + p.cryptoCtx = CryptoContext{} + p.dataPageAad = "" + p.dataPageHeaderAad = "" + } +} + +func (p *serializedPageReader) Err() error { return p.err } + +func (p *serializedPageReader) SetMaxPageHeaderSize(sz int) { + p.maxPageHeaderSize = sz +} + +func (p *serializedPageReader) initDecryption() { + if p.cryptoCtx.DataDecryptor != nil { + p.dataPageAad = encryption.CreateModuleAad(p.cryptoCtx.DataDecryptor.FileAad(), encryption.DataPageModule, + p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1) + } + if p.cryptoCtx.MetaDecryptor != nil { + p.dataPageHeaderAad = encryption.CreateModuleAad(p.cryptoCtx.MetaDecryptor.FileAad(), encryption.DataPageHeaderModule, + p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1) + } +} + +func (p *serializedPageReader) updateDecryption(decrypt encryption.Decryptor, moduleType int8, pageAad string) { + if p.cryptoCtx.StartDecryptWithDictionaryPage { + aad := encryption.CreateModuleAad(decrypt.FileAad(), moduleType, p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1) + decrypt.UpdateAad(aad) + } else { + pageaad := []byte(pageAad) + encryption.QuickUpdatePageAad(pageaad, p.pageOrd) + decrypt.UpdateAad(string(pageaad)) + } +} + +func (p *serializedPageReader) Page() Page { + return p.curPage +} + +func (p *serializedPageReader) decompress(lenCompressed int, buf []byte) ([]byte, error) { + p.decompressBuffer.Reset() + p.decompressBuffer.Grow(lenCompressed) + if _, err := io.CopyN(&p.decompressBuffer, p.r, int64(lenCompressed)); err != nil { + return nil, err + } + + data := p.decompressBuffer.Bytes() + if p.cryptoCtx.DataDecryptor != nil { + data = p.cryptoCtx.DataDecryptor.Decrypt(p.decompressBuffer.Bytes()) + } + + return p.codec.Decode(buf, data), nil +} + +type dataheader interface { + IsSetStatistics() bool + GetStatistics() *format.Statistics +} + +func extractStats(dataHeader dataheader) (pageStats metadata.EncodedStatistics) { + if dataHeader.IsSetStatistics() { + stats := dataHeader.GetStatistics() + if stats.IsSetMaxValue() { + pageStats.SetMax(stats.GetMaxValue()) + } else if stats.IsSetMax() { + pageStats.SetMax(stats.GetMax()) + } + if stats.IsSetMinValue() { + pageStats.SetMin(stats.GetMinValue()) + } else if stats.IsSetMin() { + pageStats.SetMin(stats.GetMin()) + } + + if stats.IsSetNullCount() { + pageStats.SetNullCount(stats.GetNullCount()) + } + if stats.IsSetDistinctCount() { + pageStats.SetDistinctCount(stats.GetDistinctCount()) + } + } + return +} + +func (p 
*serializedPageReader) Next() bool { + // Loop here because there may be unhandled page types that we skip until + // finding a page that we do know what to do with + if p.curPage != nil { + p.curPage.Release() + } + p.curPage = nil + p.curPageHdr = format.NewPageHeader() + p.err = nil + + for p.rowsSeen < p.nrows { + // headerSize := 0 + allowedPgSz := defaultPageHeaderSize + + start, _ := p.r.Seek(0, io.SeekCurrent) + p.decompressBuffer.Reset() + // Page headers can be very large because of page statistics + // We try to deserialize a larger buffer progressively + // until a maximum allowed header limit + for { + n, err := io.CopyN(&p.decompressBuffer, p.r, int64(allowedPgSz)) + // view, err := p.r.Peek(allowedPgSz) + if err != nil && err != io.EOF { + p.err = err + return false + } + + if n == 0 { + return false + } + + view := p.decompressBuffer.Bytes() + + extra := 0 + if p.cryptoCtx.MetaDecryptor != nil { + p.updateDecryption(p.cryptoCtx.MetaDecryptor, encryption.DictPageHeaderModule, p.dataPageHeaderAad) + view = p.cryptoCtx.MetaDecryptor.Decrypt(view) + extra = p.cryptoCtx.MetaDecryptor.CiphertextSizeDelta() + } + + remaining, err := thrift.DeserializeThrift(p.curPageHdr, view) + if err != nil { + allowedPgSz *= 2 + if allowedPgSz > p.maxPageHeaderSize { + p.err = xerrors.New("parquet: deserializing page header failed") + return false + } + continue + } + + p.r.Seek(start+int64(len(view)-int(remaining)+extra), io.SeekStart) + break + } + + lenCompressed := int(p.curPageHdr.GetCompressedPageSize()) + lenUncompressed := int(p.curPageHdr.GetUncompressedPageSize()) + if lenCompressed < 0 || lenUncompressed < 0 { + p.err = xerrors.New("parquet: invalid page header") + return false + } + + if p.cryptoCtx.DataDecryptor != nil { + p.updateDecryption(p.cryptoCtx.DataDecryptor, encryption.DictPageModule, p.dataPageAad) + } + + p.buf.ResizeNoShrink(lenUncompressed) + + switch p.curPageHdr.GetType() { + case format.PageType_DICTIONARY_PAGE: + p.cryptoCtx.StartDecryptWithDictionaryPage = false + dictHeader := p.curPageHdr.GetDictionaryPageHeader() + if dictHeader.GetNumValues() < 0 { + p.err = xerrors.New("parquet: invalid page header (negative number of values)") + return false + } + + data, err := p.decompress(lenCompressed, p.buf.Bytes()) + if err != nil { + p.err = err + return false + } + debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + + // p.buf.Resize(lenUncompressed) + // make dictionary page + p.curPage = &DictionaryPage{ + page: page{ + buf: memory.NewBufferBytes(data), + typ: p.curPageHdr.Type, + nvals: dictHeader.GetNumValues(), + encoding: dictHeader.GetEncoding(), + }, + sorted: dictHeader.IsSetIsSorted() && dictHeader.GetIsSorted(), + } + + case format.PageType_DATA_PAGE: + p.pageOrd++ + dataHeader := p.curPageHdr.GetDataPageHeader() + if dataHeader.GetNumValues() < 0 { + p.err = xerrors.New("parquet: invalid page header (negative number of values)") + return false + } + + p.rowsSeen += int64(dataHeader.GetNumValues()) + data, err := p.decompress(lenCompressed, p.buf.Bytes()) + if err != nil { + p.err = err + return false + } + debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + + // make datapagev1 + p.curPage = &DataPageV1{ + page: page{ + buf: memory.NewBufferBytes(data), + typ: p.curPageHdr.Type, + nvals: dataHeader.GetNumValues(), + encoding: dataHeader.GetEncoding(), + }, + defLvlEncoding: dataHeader.GetDefinitionLevelEncoding(), + repLvlEncoding: dataHeader.GetRepetitionLevelEncoding(), + uncompressedSize: 
int64(lenUncompressed), + statistics: extractStats(dataHeader), + } + case format.PageType_DATA_PAGE_V2: + p.pageOrd++ + dataHeader := p.curPageHdr.GetDataPageHeaderV2() + if dataHeader.GetNumValues() < 0 { + p.err = xerrors.New("parquet: invalid page header (negative number of values)") + return false + } + + if dataHeader.GetDefinitionLevelsByteLength() < 0 || dataHeader.GetRepetitionLevelsByteLength() < 0 { + p.err = xerrors.New("parquet: invalid page header (negative levels byte length)") + return false + } + + compressed := dataHeader.GetIsCompressed() + // extract stats + p.rowsSeen += int64(dataHeader.GetNumValues()) + levelsBytelen, ok := overflow.Add(int(dataHeader.GetDefinitionLevelsByteLength()), int(dataHeader.GetRepetitionLevelsByteLength())) + if !ok { + p.err = xerrors.New("parquet: levels size too large (corrupt file?)") + return false + } + + var data []byte + if compressed { + if levelsBytelen > 0 { + io.ReadFull(p.r, p.buf.Bytes()[:levelsBytelen]) + } + if data, p.err = p.decompress(lenCompressed-levelsBytelen, p.buf.Bytes()[levelsBytelen:]); p.err != nil { + return false + } + } else { + io.ReadFull(p.r, p.buf.Bytes()) + data = p.buf.Bytes() + } + debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + + // make datapage v2 + p.curPage = &DataPageV2{ + page: page{ + buf: memory.NewBufferBytes(data), + typ: p.curPageHdr.Type, + nvals: dataHeader.GetNumValues(), + encoding: dataHeader.GetEncoding(), + }, + nulls: dataHeader.GetNumNulls(), + nrows: dataHeader.GetNumRows(), + defLvlByteLen: dataHeader.GetDefinitionLevelsByteLength(), + repLvlByteLen: dataHeader.GetRepetitionLevelsByteLength(), + compressed: compressed, + uncompressedSize: int64(lenUncompressed), + statistics: extractStats(dataHeader), + } + default: + // we don't know this page type, we're allowed to skip non-data pages + continue + } + + p.buf = memory.NewResizableBuffer(p.mem) + return true + } + + return false +} diff --git a/go/parquet/file/row_group_reader.go b/go/parquet/file/row_group_reader.go new file mode 100644 index 0000000000000..9c74a25c11eca --- /dev/null +++ b/go/parquet/file/row_group_reader.go @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
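[Editor's note] The serializedPageReader introduced above is driven through the Next/Page/Err methods defined in this commit. The following is a minimal, hypothetical consumption sketch, not part of the patch: the function name and the src/numRows/codec placeholders are invented for illustration, and it assumes the parquet and compress packages of this module are imported; only identifiers that appear in this commit (NewPageReader, the page types and their accessors) are used. Note that Next releases the page returned by the previous iteration, so callers should not hold a Page across calls.

// sketch only: src, numRows and codec are supplied by the caller
func dumpPages(src parquet.ReaderAtSeeker, numRows int64, codec compress.Compression) error {
	rdr, err := NewPageReader(src, numRows, codec, nil /*default allocator*/, nil /*no decryption*/)
	if err != nil {
		return err
	}
	for rdr.Next() { // Next releases the page handed out on the previous iteration
		switch pg := rdr.Page().(type) {
		case *DictionaryPage:
			_ = pg.IsSorted()
		case *DataPageV2:
			_ = pg.NumNulls() // v2 pages also expose level byte lengths and IsCompressed
		case *DataPageV1:
			// plain v1 data page
		}
	}
	return rdr.Err() // reports whatever error, if any, stopped the iteration
}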
+ +package file + +import ( + "github.com/apache/arrow/go/arrow/ipc" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encryption" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/metadata" + "golang.org/x/xerrors" +) + +const ( + maxDictHeaderSize int64 = 100 +) + +// RowGroupReader is the primary interface for reading a single row group +type RowGroupReader struct { + r ipc.ReadAtSeeker + sourceSz int64 + fileMetadata *metadata.FileMetaData + rgMetadata *metadata.RowGroupMetaData + props *parquet.ReaderProperties + fileDecryptor encryption.FileDecryptor +} + +// MetaData returns the metadata of the current Row Group +func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData { return r.rgMetadata } + +// NumColumns returns the number of columns of data as defined in the metadata of this row group +func (r *RowGroupReader) NumColumns() int { return r.rgMetadata.NumColumns() } + +// NumRows returns the number of rows in just this row group +func (r *RowGroupReader) NumRows() int64 { return r.rgMetadata.NumRows() } + +// ByteSize returns the full byte size of this row group as defined in its metadata +func (r *RowGroupReader) ByteSize() int64 { return r.rgMetadata.TotalByteSize() } + +// Column returns a column reader for the requested (0-indexed) column +// +// panics if passed a column not in the range [0, NumColumns) +func (r *RowGroupReader) Column(i int) ColumnChunkReader { + if i >= r.NumColumns() || i < 0 { + panic(xerrors.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns())) + } + + descr := r.fileMetadata.Schema.Column(i) + pageRdr, err := r.GetColumnPageReader(i) + if err != nil { + panic(xerrors.Errorf("parquet: unable to initialize page reader: %w", err)) + } + return NewColumnReader(descr, pageRdr, r.props.Allocator()) +} + +func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) { + col, err := r.rgMetadata.ColumnChunk(i) + if err != nil { + return nil, err + } + + colStart := col.DataPageOffset() + if col.HasDictionaryPage() && col.DictionaryPageOffset() > 0 && colStart > col.DictionaryPageOffset() { + colStart = col.DictionaryPageOffset() + } + + colLen := col.TotalCompressedSize() + if r.fileMetadata.WriterVersion().LessThan(metadata.Parquet816FixedVersion) { + bytesRemain := r.sourceSz - (colStart + colLen) + padding := utils.Min(maxDictHeaderSize, bytesRemain) + colLen += padding + } + + stream, err := r.props.GetStream(r.r, colStart, colLen) + if err != nil { + return nil, err + } + + cryptoMetadata := col.CryptoMetadata() + if cryptoMetadata == nil { + return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), nil) + } + + if r.fileDecryptor == nil { + return nil, xerrors.New("column in rowgroup is encrypted, but no file decryptor") + } + + const encryptedRowGroupsLimit = 32767 + if i > encryptedRowGroupsLimit { + return nil, xerrors.New("encrypted files cannot contain more than 32767 column chunks") + } + + if cryptoMetadata.IsSetENCRYPTION_WITH_FOOTER_KEY() { + ctx := CryptoContext{ + StartDecryptWithDictionaryPage: col.HasDictionaryPage(), + RowGroupOrdinal: r.rgMetadata.Ordinal(), + ColumnOrdinal: int16(i), + MetaDecryptor: r.fileDecryptor.GetFooterDecryptorForColumnMeta(""), + DataDecryptor: r.fileDecryptor.GetFooterDecryptorForColumnData(""), + } + return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx) + } + + // column encrypted with 
it's own key + columnKeyMeta := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().KeyMetadata + columnPath := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().PathInSchema + + ctx := CryptoContext{ + StartDecryptWithDictionaryPage: col.HasDictionaryPage(), + RowGroupOrdinal: r.rgMetadata.Ordinal(), + ColumnOrdinal: int16(i), + MetaDecryptor: r.fileDecryptor.GetColumnMetaDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""), + DataDecryptor: r.fileDecryptor.GetColumnDataDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""), + } + return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx) +} diff --git a/go/parquet/go.sum b/go/parquet/go.sum index cf7b6789c2a75..46b4f5a555aa9 100644 --- a/go/parquet/go.sum +++ b/go/parquet/go.sum @@ -75,6 +75,7 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= +github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= diff --git a/go/parquet/internal/bmi/bitmap_bmi2_noasm.go b/go/parquet/internal/bmi/bitmap_bmi2_noasm.go new file mode 100644 index 0000000000000..6dc4a39a60e5a --- /dev/null +++ b/go/parquet/internal/bmi/bitmap_bmi2_noasm.go @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
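[Editor's note] The one-line bmi.go change in the hunk below replaces `uint64(value << bitLen)` with `uint64(value) << bitLen`. The distinction matters because a Go shift is evaluated in the (narrow) type of its left operand before any conversion, so set bits were silently discarded once bitLen reached that type's width. A self-contained illustration follows; it assumes an 8-bit lookup-table entry, which is not shown in the hunk itself and is used here only to make the truncation visible.

package main

import "fmt"

func main() {
	var value uint8 = 0x01 // hypothetical pext lookup-table entry
	bitLen := 8

	// old expression: the shift happens in uint8, so the bit is lost before widening
	fmt.Println(uint64(value << bitLen)) // prints 0
	// fixed expression: widen to uint64 first, then shift
	fmt.Println(uint64(value) << bitLen) // prints 256
}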
+ +// +build noasm + +package bmi + +func init() { + funclist.extractBits = extractBitsGo + funclist.gtbitmap = greaterThanBitmapGo +} diff --git a/go/parquet/internal/bmi/bmi.go b/go/parquet/internal/bmi/bmi.go index ea0f6e374febe..a12af3e75d8e4 100644 --- a/go/parquet/internal/bmi/bmi.go +++ b/go/parquet/internal/bmi/bmi.go @@ -254,7 +254,7 @@ func extractBitsGo(bitmap, selectBitmap uint64) uint64 { for selectBitmap != 0 { maskLen := bits.OnesCount32(uint32(selectBitmap & lookupMask)) value := pextTable[selectBitmap&lookupMask][bitmap&lookupMask] - bitValue |= uint64(value << bitLen) + bitValue |= uint64(value) << bitLen bitLen += maskLen bitmap >>= lookupBits selectBitmap >>= lookupBits diff --git a/go/parquet/internal/encoding/boolean_decoder.go b/go/parquet/internal/encoding/boolean_decoder.go index a33b21a3181f6..bdf1fd56f9825 100644 --- a/go/parquet/internal/encoding/boolean_decoder.go +++ b/go/parquet/internal/encoding/boolean_decoder.go @@ -45,7 +45,7 @@ func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) { unalignedExtract := func(start, end, curBitOffset int) int { i := start - for ; curBitOffset < end; i, curBitOffset = i+1, curBitOffset+1 { + for ; curBitOffset < end && i < max; i, curBitOffset = i+1, curBitOffset+1 { out[i] = (dec.data[0] & byte(1<= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *Int96DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]parquet.Int96) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for parquet.Int96 +func (dc *Int96DictConverter) FillZero(out interface{}) { + o := out.([]parquet.Int96) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. +func (dc *Int96DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]parquet.Int96) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + // Float32Encoder is the interface for all encoding types that implement encoding // float32 values. 
type Float32Encoder interface { @@ -1385,6 +1535,8 @@ func NewDictConverter(dict TypedDecoder) utils.DictionaryConverter { return &Int32DictConverter{valueDecoder: dict.(Int32Decoder), dict: make([]int32, 0, dict.ValuesLeft())} case parquet.Types.Int64: return &Int64DictConverter{valueDecoder: dict.(Int64Decoder), dict: make([]int64, 0, dict.ValuesLeft())} + case parquet.Types.Int96: + return &Int96DictConverter{valueDecoder: dict.(Int96Decoder), dict: make([]parquet.Int96, 0, dict.ValuesLeft())} case parquet.Types.Float: return &Float32DictConverter{valueDecoder: dict.(Float32Decoder), dict: make([]float32, 0, dict.ValuesLeft())} case parquet.Types.Double: diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl index d2ebbe423e0a7..14c1e9a46f50f 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl +++ b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl @@ -56,15 +56,15 @@ type {{.lower}}EncoderTraits struct{} // Encoder returns an encoder for {{.lower}} type data, using the specified encoding type and whether or not // it should be dictionary encoded. -{{- if or (eq .Name "Boolean") (eq .Name "Int96")}} +{{- if or (eq .Name "Boolean") }} // dictionary encoding does not exist for this type and Encoder will panic if useDict is true {{- end }} func ({{.lower}}EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { if useDict { -{{- if or (eq .Name "Boolean") (eq .Name "Int96")}} +{{- if or (eq .Name "Boolean") }} panic("parquet: no {{.name}} dictionary encoding") {{- else}} - return &Dict{{.Name}}Encoder{newDictEncoderBase(descr, New{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}}{{.Name}}Dictionary(){{else}}BinaryDictionary(mem){{end}}, mem)} + return &Dict{{.Name}}Encoder{newDictEncoderBase(descr, New{{if and (ne .Name "Int96") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}}{{.Name}}Dictionary(){{else}}BinaryDictionary(mem){{end}}, mem)} {{- end}} } @@ -105,7 +105,7 @@ func ({{.lower}}DecoderTraits) BytesRequired(n int) int { // Decoder returns a decoder for {{.lower}} typed data of the requested encoding type if available func ({{.lower}}DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { if useDict { -{{- if and (ne .Name "Boolean") (ne .Name "Int96")}} +{{- if and (ne .Name "Boolean") }} return &Dict{{.Name}}Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} {{- else}} panic("dictionary decoding unimplemented for {{.lower}}") @@ -150,7 +150,7 @@ func ({{.lower}}DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, } } -{{if and (ne .Name "Boolean") (ne .Name "Int96")}} +{{if and (ne .Name "Boolean") }} // Dict{{.Name}}Encoder is an encoder for {{.name}} data using dictionary encoding type Dict{{.Name}}Encoder struct { dictEncoder @@ -162,6 +162,12 @@ func (enc *Dict{{.Name}}Encoder) Type() parquet.Type { } {{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}} +{{if (ne .Name "Int96")}} +// WriteDict populates the byte slice with the dictionary index +func (enc *Dict{{.Name}}Encoder) WriteDict(out []byte) { + enc.memo.CopyValues({{.prefix}}.{{.Name}}Traits.CastFromBytes(out)) +} + // Put encodes the values passed in, adding to the index as needed. 
func (enc *Dict{{.Name}}Encoder) Put(in []{{.name}}) { for _, val := range in { @@ -179,6 +185,34 @@ func (enc *Dict{{.Name}}Encoder) PutSpaced(in []{{.name}}, validBits []byte, val return nil }) } +{{else}} +// WriteDict populates the byte slice with the dictionary index +func (enc *DictInt96Encoder) WriteDict(out []byte) { + enc.memo.(BinaryMemoTable).CopyFixedWidthValues(0, parquet.Int96SizeBytes, out) +} + +// Put encodes the values passed in, adding to the index as needed +func (enc *DictInt96Encoder) Put(in []parquet.Int96) { + for _, v := range in { + memoIdx, found, err := enc.memo.GetOrInsert(v) + if err != nil { + panic(err) + } + if !found { + enc.dictEncodedSize += parquet.Int96SizeBytes + } + enc.addIndex(memoIdx) + } +} + +// PutSpaced is like Put but assumes space for nulls +func (enc *DictInt96Encoder) PutSpaced(in []parquet.Int96, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + enc.Put(in[pos : pos+length]) + return nil + }) +} +{{end}} {{end}} // Dict{{.Name}}Decoder is a decoder for decoding dictionary encoded data for {{.name}} columns @@ -302,7 +336,7 @@ func (dc *{{.Name}}DictConverter) Copy(out interface{}, vals []utils.IndexType) // decoder as the decoder to decode the dictionary index. func NewDictConverter(dict TypedDecoder) utils.DictionaryConverter { switch dict.Type() { - {{ range .In }}{{ if and (ne .Name "Boolean") (ne .Name "Int96") -}} + {{ range .In }}{{ if and (ne .Name "Boolean") -}} case parquet.Types.{{if .physical }}{{.physical}}{{else}}{{.Name}}{{end}}: return &{{.Name}}DictConverter{valueDecoder: dict.({{.Name}}Decoder), dict: make([]{{.name}}, 0, dict.ValuesLeft())} {{ end }}{{ end -}} diff --git a/go/parquet/internal/testutils/pagebuilder.go b/go/parquet/internal/testutils/pagebuilder.go new file mode 100644 index 0000000000000..f742f1a561aaf --- /dev/null +++ b/go/parquet/internal/testutils/pagebuilder.go @@ -0,0 +1,297 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
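[Editor's note] The template changes above give Int96 the same dictionary-encoding entry points as the other fixed-width types: index values with Put, emit the index stream with FlushValues, then serialize the dictionary with DictEncodedSize/WriteDict. That is the sequence the DictionaryPageBuilder in the test utilities below relies on. A rough, hypothetical sketch using only calls shown in this commit; the helper name and its col/mem/values parameters are placeholders, and since the encoding package is internal such a helper could only live inside this module, as the test utilities do.

func buildInt96Dictionary(col *schema.Column, mem memory.Allocator, values []parquet.Int96) (dict, indices []byte, err error) {
	enc := encoding.NewEncoder(col.PhysicalType(), parquet.Encodings.Plain, true /*useDict*/, col, mem)
	enc.(encoding.Int96Encoder).Put(values) // grows the dictionary and buffers the indices

	idx, err := enc.FlushValues() // encoded dictionary indices: the data-page payload
	if err != nil {
		return nil, nil, err
	}

	dictEnc := enc.(encoding.DictEncoder)
	dict = make([]byte, dictEnc.DictEncodedSize())
	dictEnc.WriteDict(dict) // dictionary-page payload via the fixed-width binary memo table
	return dict, idx.Bytes(), nil
}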
+ +package testutils + +import ( + "encoding/binary" + "io" + "reflect" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/compress" + "github.com/apache/arrow/go/parquet/file" + "github.com/apache/arrow/go/parquet/internal/encoding" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" + "github.com/stretchr/testify/mock" +) + +type DataPageBuilder struct { + sink io.Writer + version parquet.DataPageVersion + + nvals int + encoding parquet.Encoding + defLvlEncoding parquet.Encoding + repLvlEncoding parquet.Encoding + defLvlBytesLen int + repLvlBytesLen int + hasDefLvls bool + hasRepLvls bool + hasValues bool +} + +var mem = memory.NewGoAllocator() + +func (d *DataPageBuilder) appendLevels(lvls []int16, maxLvl int16, e parquet.Encoding) int { + if e != parquet.Encodings.RLE { + panic("parquet: only rle encoding currently implemented") + } + + buf := encoding.NewBufferWriter(encoding.LevelEncodingMaxBufferSize(e, maxLvl, len(lvls)), memory.DefaultAllocator) + var enc encoding.LevelEncoder + enc.Init(e, maxLvl, buf) + enc.Encode(lvls) + + rleBytes := enc.Len() + if d.version == parquet.DataPageV1 { + if err := binary.Write(d.sink, binary.LittleEndian, int32(rleBytes)); err != nil { + panic(err) + } + } + + if _, err := d.sink.Write(buf.Bytes()[:rleBytes]); err != nil { + panic(err) + } + return rleBytes +} + +func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) { + d.defLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) + + d.nvals = utils.MaxInt(len(lvls), d.nvals) + d.defLvlEncoding = parquet.Encodings.RLE + d.hasDefLvls = true +} + +func (d *DataPageBuilder) AppendRepLevels(lvls []int16, maxLvl int16) { + d.repLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) + + d.nvals = utils.MaxInt(len(lvls), d.nvals) + d.repLvlEncoding = parquet.Encodings.RLE + d.hasRepLvls = true +} + +func (d *DataPageBuilder) AppendValues(desc *schema.Column, values interface{}, e parquet.Encoding) { + enc := encoding.NewEncoder(desc.PhysicalType(), e, false, desc, mem) + var sz int + switch v := values.(type) { + case []int32: + enc.(encoding.Int32Encoder).Put(v) + sz = len(v) + case []int64: + enc.(encoding.Int64Encoder).Put(v) + sz = len(v) + case []parquet.Int96: + enc.(encoding.Int96Encoder).Put(v) + sz = len(v) + case []float32: + enc.(encoding.Float32Encoder).Put(v) + sz = len(v) + case []float64: + enc.(encoding.Float64Encoder).Put(v) + sz = len(v) + case []parquet.ByteArray: + enc.(encoding.ByteArrayEncoder).Put(v) + sz = len(v) + } + buf, _ := enc.FlushValues() + _, err := d.sink.Write(buf.Bytes()) + if err != nil { + panic(err) + } + + d.nvals = utils.MaxInt(sz, d.nvals) + d.encoding = e + d.hasValues = true +} + +type DictionaryPageBuilder struct { + traits encoding.DictEncoder + numDictValues int32 + hasValues bool +} + +func NewDictionaryPageBuilder(d *schema.Column) *DictionaryPageBuilder { + return &DictionaryPageBuilder{ + encoding.NewEncoder(d.PhysicalType(), parquet.Encodings.Plain, true, d, mem).(encoding.DictEncoder), + 0, false} +} + +func (d *DictionaryPageBuilder) AppendValues(values interface{}) encoding.Buffer { + switch v := values.(type) { + case []int32: + d.traits.(encoding.Int32Encoder).Put(v) + case []int64: + d.traits.(encoding.Int64Encoder).Put(v) + case []parquet.Int96: + d.traits.(encoding.Int96Encoder).Put(v) + case []float32: + d.traits.(encoding.Float32Encoder).Put(v) + case []float64: + 
d.traits.(encoding.Float64Encoder).Put(v) + case []parquet.ByteArray: + d.traits.(encoding.ByteArrayEncoder).Put(v) + } + + d.numDictValues = int32(d.traits.NumEntries()) + d.hasValues = true + buf, _ := d.traits.FlushValues() + return buf +} + +func (d *DictionaryPageBuilder) WriteDict() *memory.Buffer { + buf := memory.NewBufferBytes(make([]byte, d.traits.DictEncodedSize())) + d.traits.WriteDict(buf.Bytes()) + return buf +} + +func (d *DictionaryPageBuilder) NumValues() int32 { + return d.numDictValues +} + +func MakeDataPage(dataPageVersion parquet.DataPageVersion, d *schema.Column, values interface{}, nvals int, e parquet.Encoding, indexBuffer encoding.Buffer, defLvls, repLvls []int16, maxDef, maxRep int16) file.Page { + num := 0 + + stream := encoding.NewBufferWriter(1024, mem) + builder := DataPageBuilder{sink: stream, version: dataPageVersion} + + if len(repLvls) > 0 { + builder.AppendRepLevels(repLvls, maxRep) + } + if len(defLvls) > 0 { + builder.AppendDefLevels(defLvls, maxDef) + } + + if e == parquet.Encodings.Plain { + builder.AppendValues(d, values, e) + num = builder.nvals + } else { + stream.Write(indexBuffer.Bytes()) + num = utils.MaxInt(builder.nvals, nvals) + } + + buf := stream.Finish() + if dataPageVersion == parquet.DataPageV1 { + return file.NewDataPageV1(buf, int32(num), e, builder.defLvlEncoding, builder.repLvlEncoding, int64(buf.Len())) + } + return file.NewDataPageV2(buf, int32(num), 0, int32(num), e, int32(builder.defLvlBytesLen), int32(builder.repLvlBytesLen), int64(buf.Len()), false) +} + +func MakeDictPage(d *schema.Column, values interface{}, valuesPerPage []int, e parquet.Encoding) (*file.DictionaryPage, []encoding.Buffer) { + bldr := NewDictionaryPageBuilder(d) + npages := len(valuesPerPage) + + ref := reflect.ValueOf(values) + valStart := 0 + + rleIndices := make([]encoding.Buffer, 0, npages) + for _, nvals := range valuesPerPage { + rleIndices = append(rleIndices, bldr.AppendValues(ref.Slice(valStart, valStart+nvals).Interface())) + valStart += nvals + } + + buffer := bldr.WriteDict() + return file.NewDictionaryPage(buffer, bldr.NumValues(), parquet.Encodings.Plain), rleIndices +} + +type MockPageReader struct { + mock.Mock + + curpage int +} + +func (m *MockPageReader) Err() error { + return m.Called().Error(0) +} + +func (m *MockPageReader) Reset(parquet.ReaderAtSeeker, int64, compress.Compression, *file.CryptoContext) { +} + +func (m *MockPageReader) SetMaxPageHeaderSize(int) {} + +func (m *MockPageReader) Page() file.Page { + return m.TestData().Get("pages").Data().([]file.Page)[m.curpage-1] +} + +func (m *MockPageReader) Next() bool { + pageList := m.TestData().Get("pages").Data().([]file.Page) + m.curpage++ + return len(pageList) >= m.curpage +} + +func PaginatePlain(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, + maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page { + + var ( + npages = len(valuesPerPage) + defLvlStart = 0 + defLvlEnd = 0 + repLvlStart = 0 + repLvlEnd = 0 + valueStart = 0 + ) + + pageList := make([]file.Page, 0, npages) + for i := 0; i < npages; i++ { + if maxDef > 0 { + defLvlStart = i * lvlsPerPage + defLvlEnd = (i + 1) * lvlsPerPage + } + if maxRep > 0 { + repLvlStart = i * lvlsPerPage + repLvlEnd = (i + 1) * lvlsPerPage + } + + page := MakeDataPage(version, d, + values.Slice(valueStart, valueStart+valuesPerPage[i]).Interface(), + valuesPerPage[i], enc, nil, defLevels[defLvlStart:defLvlEnd], + repLevels[repLvlStart:repLvlEnd], 
maxDef, maxRep) + valueStart += valuesPerPage[i] + pageList = append(pageList, page) + } + return pageList +} + +func PaginateDict(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page { + var ( + npages = len(valuesPerPage) + pages = make([]file.Page, 0, npages) + defStart = 0 + defEnd = 0 + repStart = 0 + repEnd = 0 + ) + + dictPage, rleIndices := MakeDictPage(d, values.Interface(), valuesPerPage, enc) + pages = append(pages, dictPage) + for i := 0; i < npages; i++ { + if maxDef > 0 { + defStart = i * lvlsPerPage + defEnd = (i + 1) * lvlsPerPage + } + if maxRep > 0 { + repStart = i * lvlsPerPage + repEnd = (i + 1) * lvlsPerPage + } + page := MakeDataPage(version, d, nil, valuesPerPage[i], enc, rleIndices[i], + defLevels[defStart:defEnd], repLevels[repStart:repEnd], maxDef, maxRep) + pages = append(pages, page) + } + return pages +} diff --git a/go/parquet/reader_properties.go b/go/parquet/reader_properties.go index 92abae57dc1c2..7e99d9f68705a 100644 --- a/go/parquet/reader_properties.go +++ b/go/parquet/reader_properties.go @@ -20,7 +20,6 @@ import ( "bytes" "io" - "github.com/apache/arrow/go/arrow/ipc" "github.com/apache/arrow/go/arrow/memory" "golang.org/x/xerrors" ) @@ -61,7 +60,7 @@ func (r *ReaderProperties) Allocator() memory.Allocator { return r.alloc } // // If BufferedStreamEnabled is true, it creates an io.SectionReader, otherwise it will read the entire section // into a buffer in memory and return a bytes.NewReader for that buffer. -func (r *ReaderProperties) GetStream(source io.ReaderAt, start, nbytes int64) (ipc.ReadAtSeeker, error) { +func (r *ReaderProperties) GetStream(source io.ReaderAt, start, nbytes int64) (ReaderAtSeeker, error) { if r.BufferedStreamEnabled { return io.NewSectionReader(source, start, nbytes), nil } diff --git a/go/parquet/types.go b/go/parquet/types.go index e568984ebe39c..630244ca8e9df 100644 --- a/go/parquet/types.go +++ b/go/parquet/types.go @@ -18,6 +18,7 @@ package parquet import ( "encoding/binary" + "io" "reflect" "strings" "time" @@ -47,6 +48,15 @@ var ( FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size()) ) +// ReaderAtSeeker is a combination of the ReaderAt and ReadSeeker interfaces +// from the io package defining the only functionality that is required +// in order for a parquet file to be read by the file functions. We just need +// to be able to call ReadAt, Read, and Seek +type ReaderAtSeeker interface { + io.ReaderAt + io.ReadSeeker +} + // NewInt96 creates a new Int96 from the given 3 uint32 values. 
func NewInt96(v [3]uint32) (out Int96) { binary.LittleEndian.PutUint32(out[0:], v[0]) From e7fca7756063def4c1583b9109067a65ddcb7a53 Mon Sep 17 00:00:00 2001 From: Benson Muite Date: Sun, 24 Oct 2021 05:48:00 +0900 Subject: [PATCH 010/194] ARROW-14451: [Release][Ruby] The `--path` flag is deprecated Closes #11528 from bkmgit/ARROW-14451 Authored-by: Benson Muite Signed-off-by: Sutou Kouhei --- dev/release/verify-release-candidate.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 212547744c4e4..63561a12b87fc 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -444,7 +444,8 @@ test_ruby() { for module in ${modules}; do pushd ${module} - bundle install --path vendor/bundle + bundle config set --local path 'vendor/bundle' + bundle install bundle exec ruby test/run-test.rb popd done From e7158c62ae43cbcea3f90c11dcbb40ffbbc94484 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Sat, 23 Oct 2021 17:21:15 -0400 Subject: [PATCH 011/194] ARROW-13984: [Go][Parquet] File readers Looks like I merged #11146 before it finished sync'ing to the apache mirror and was missing a few commits. Here's the missing ones. Closes #11530 from zeroshade/goparquet-file Authored-by: Matthew Topol Signed-off-by: Matthew Topol --- go/parquet/file/page_reader.go | 16 ++++++++++++---- go/parquet/file/row_group_reader.go | 10 ++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/go/parquet/file/page_reader.go b/go/parquet/file/page_reader.go index 251499af21ce7..5c36b338ba9f8 100644 --- a/go/parquet/file/page_reader.go +++ b/go/parquet/file/page_reader.go @@ -26,7 +26,6 @@ import ( "github.com/apache/arrow/go/arrow/memory" "github.com/apache/arrow/go/parquet" "github.com/apache/arrow/go/parquet/compress" - "github.com/apache/arrow/go/parquet/internal/debug" "github.com/apache/arrow/go/parquet/internal/encryption" format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" "github.com/apache/arrow/go/parquet/internal/thrift" @@ -512,7 +511,10 @@ func (p *serializedPageReader) Next() bool { p.err = err return false } - debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + if len(data) != lenUncompressed { + p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed dictionary page, got %d bytes", lenUncompressed, len(data)) + return false + } // p.buf.Resize(lenUncompressed) // make dictionary page @@ -540,7 +542,10 @@ func (p *serializedPageReader) Next() bool { p.err = err return false } - debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + if len(data) != lenUncompressed { + p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, len(data)) + return false + } // make datapagev1 p.curPage = &DataPageV1{ @@ -589,7 +594,10 @@ func (p *serializedPageReader) Next() bool { io.ReadFull(p.r, p.buf.Bytes()) data = p.buf.Bytes() } - debug.Assert(len(data) == lenUncompressed, "len(data) != lenUncompressed") + if len(data) != lenUncompressed { + p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, len(data)) + return false + } // make datapage v2 p.curPage = &DataPageV2{ diff --git a/go/parquet/file/row_group_reader.go b/go/parquet/file/row_group_reader.go index 9c74a25c11eca..455144e266ac0 100644 --- a/go/parquet/file/row_group_reader.go +++ b/go/parquet/file/row_group_reader.go @@ 
-79,7 +79,17 @@ func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) { } colLen := col.TotalCompressedSize() + // PARQUET-816 workaround for old files created by older parquet-mr if r.fileMetadata.WriterVersion().LessThan(metadata.Parquet816FixedVersion) { + // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the + // dictionary page header size in total_compressed_size and total_uncompressed_size + // (see IMPALA-694). We add padding to compensate. + if colStart < 0 || colLen < 0 { + return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) should both be positive", colStart, colLen) + } + if colStart > r.sourceSz || colLen > r.sourceSz { + return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) must both be less than total source size (%d)", colStart, colLen, r.sourceSz) + } bytesRemain := r.sourceSz - (colStart + colLen) padding := utils.Min(maxDictHeaderSize, bytesRemain) colLen += padding From be665ef948cb2c6706c60053c5db918e948713e8 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 25 Oct 2021 08:53:12 -0400 Subject: [PATCH 012/194] ARROW-13879: [C++] Mixed support for binary types in regex functions This PR extends variable-width binary types support for string functions: * find_substring[_regex] * count_substring[_regex] * match_substring[_regex] * split_pattern[_regex] * replace_substring[_regex] * match_like * starts/ends_with * extract_regex Also, updates several scalar string kernel/function registrations. Closes #11233 from edponce/ARROW-13879-Mixed-support-for-binary-types-in-regex- Authored-by: Eduardo Ponce Signed-off-by: David Li --- cpp/src/arrow/array/array_binary_test.cc | 12 +- .../arrow/compute/kernels/aggregate_test.cc | 4 +- .../arrow/compute/kernels/codegen_internal.h | 26 +- .../compute/kernels/scalar_if_else_test.cc | 8 +- .../compute/kernels/scalar_set_lookup_test.cc | 4 +- .../arrow/compute/kernels/scalar_string.cc | 603 ++++++++++-------- .../compute/kernels/scalar_string_test.cc | 455 ++++++++++--- cpp/src/arrow/compute/kernels/test_util.cc | 4 +- .../arrow/compute/kernels/vector_hash_test.cc | 2 +- .../compute/kernels/vector_replace_test.cc | 2 +- .../compute/kernels/vector_selection_test.cc | 6 +- cpp/src/arrow/testing/gtest_util.h | 4 +- docs/source/cpp/compute.rst | 52 +- 13 files changed, 759 insertions(+), 423 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 6892e5f0a91f7..7840c60f8974d 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -324,7 +324,7 @@ class TestStringArray : public ::testing::Test { std::shared_ptr strings_; }; -TYPED_TEST_SUITE(TestStringArray, BinaryArrowTypes); +TYPED_TEST_SUITE(TestStringArray, BaseBinaryArrowTypes); TYPED_TEST(TestStringArray, TestArrayBasics) { this->TestArrayBasics(); } @@ -661,7 +661,7 @@ class TestStringBuilder : public TestBuilder { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestStringBuilder, BinaryArrowTypes); +TYPED_TEST_SUITE(TestStringBuilder, BaseBinaryArrowTypes); TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } @@ -863,7 +863,7 @@ struct BinaryAppender { }; template -class TestBinaryDataVisitor : public ::testing::Test { +class TestBaseBinaryDataVisitor : public ::testing::Test { public: using TypeClass = T; @@ -891,10 +891,10 @@ class TestBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; 
-TYPED_TEST_SUITE(TestBinaryDataVisitor, BinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); -TYPED_TEST(TestBinaryDataVisitor, Basics) { this->TestBasics(); } +TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } -TYPED_TEST(TestBinaryDataVisitor, Sliced) { this->TestSliced(); } +TYPED_TEST(TestBaseBinaryDataVisitor, Sliced) { this->TestSliced(); } } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 992f73698648d..fe940006cb263 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1647,7 +1647,7 @@ TEST(TestNullMinMaxKernel, Basics) { template class TestBaseBinaryMinMaxKernel : public ::testing::Test {}; -TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BaseBinaryArrowTypes); TYPED_TEST(TestBaseBinaryMinMaxKernel, Basics) { std::vector chunked_input1 = {R"(["cc", "", "aa", "b", "c"])", R"(["d", "", null, "b", "c"])"}; @@ -2249,7 +2249,7 @@ TYPED_TEST(TestBooleanIndexKernel, Basics) { template class TestStringIndexKernel : public TestIndexKernel {}; -TYPED_TEST_SUITE(TestStringIndexKernel, BinaryArrowTypes); +TYPED_TEST_SUITE(TestStringIndexKernel, BaseBinaryArrowTypes); TYPED_TEST(TestStringIndexKernel, Basics) { auto buffer = Buffer::FromString("foo"); auto value = std::make_shared(buffer); diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 438362585b5ed..2a1167c48e273 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -1199,7 +1199,7 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) { } } -// similar to GenerateTypeAgnosticPrimitive, but for variable types +// similar to GenerateTypeAgnosticPrimitive, but for base variable binary types template