GH-35730: [C++] Add the ability to specify custom schema on a dataset write (#35860)

### Rationale for this change

The dataset write node previously allowed you to specify custom key/value metadata, which was added to support saving schema metadata.  However, key/value metadata doesn't capture field metadata or field nullability.  This PR instead adds the ability to specify a complete custom schema.  The custom schema must have the same number of fields as the input to the write node, and each field must have the same type.

### What changes are included in this PR?

Added `custom_schema` to `WriteNodeOptions`.  The existing `custom_metadata` option is retained for backwards compatibility, but providing both at once is now a `TypeError`.
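
For illustration, a minimal sketch of how a caller might use the new option.  The filesystem, output path, and field names below are hypothetical (not part of this PR); the new unit test exercises the same API against a mock filesystem:

```cpp
#include <memory>

#include "arrow/acero/exec_plan.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/plan.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/key_value_metadata.h"

namespace acero = arrow::acero;
namespace ds = arrow::dataset;

// Writes `input` (any declaration producing a single int32 column) to `fs`,
// stamping a schema that differs from the input only in nullability and
// field metadata -- the only changes custom_schema is allowed to make.
arrow::Status WriteWithCustomSchema(acero::Declaration input,
                                    std::shared_ptr<arrow::fs::FileSystem> fs) {
  ds::internal::Initialize();  // registers the "write" node factory

  ds::FileSystemDatasetWriteOptions fs_options;
  fs_options.filesystem = std::move(fs);
  fs_options.base_dir = "/my_dataset";  // hypothetical output location
  fs_options.basename_template = "part-{i}.arrow";
  fs_options.file_write_options =
      std::make_shared<ds::IpcFileFormat>()->DefaultWriteOptions();
  fs_options.partitioning = ds::Partitioning::Default();

  ds::WriteNodeOptions write_options(fs_options);
  // Must match the input's field count and types exactly; only nullability
  // and metadata may differ.
  write_options.custom_schema = arrow::schema(
      {arrow::field("x", arrow::int32(), /*nullable=*/false,
                    arrow::key_value_metadata({{"units", "meters"}}))});

  acero::Declaration plan = acero::Declaration::Sequence(
      {std::move(input), {"write", std::move(write_options)}});
  return acero::DeclarationToStatus(std::move(plan));
}
```

This mirrors what `FileSystemDataset::Write` itself now does internally: it wraps the scanner's `projected_schema` in a `WriteNodeOptions` and hands it to the write node.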

### Are these changes tested?

Yes, I added a new C++ unit test to verify that the custom info is applied to written files.

### Are there any user-facing changes?

No breaking changes.  Only new functionality (which is user-facing).

* Closes: #35730

Lead-authored-by: Weston Pace <weston.pace@gmail.com>
Co-authored-by: Nic Crane <thisisnic@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Co-authored-by: anjakefala <anja@voltrondata.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Weston Pace <weston.pace@gmail.com>
5 people committed Jun 1, 2023
1 parent 3fe4a31 commit 018e7d3
Showing 10 changed files with 310 additions and 29 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/CMakeLists.txt
@@ -151,6 +151,7 @@ add_arrow_dataset_test(file_test)
 add_arrow_dataset_test(partition_test)
 add_arrow_dataset_test(scanner_test)
 add_arrow_dataset_test(subtree_test)
+add_arrow_dataset_test(write_node_test)
 
 if(ARROW_CSV)
   add_arrow_dataset_test(file_csv_test)
53 changes: 45 additions & 8 deletions cpp/src/arrow/dataset/file_base.cc
@@ -387,16 +387,16 @@ Status WriteBatch(
 
 class DatasetWritingSinkNodeConsumer : public acero::SinkNodeConsumer {
  public:
-  DatasetWritingSinkNodeConsumer(std::shared_ptr<const KeyValueMetadata> custom_metadata,
+  DatasetWritingSinkNodeConsumer(std::shared_ptr<Schema> custom_schema,
                                  FileSystemDatasetWriteOptions write_options)
-      : custom_metadata_(std::move(custom_metadata)),
+      : custom_schema_(std::move(custom_schema)),
         write_options_(std::move(write_options)) {}
 
   Status Init(const std::shared_ptr<Schema>& schema,
               acero::BackpressureControl* backpressure_control,
               acero::ExecPlan* plan) override {
-    if (custom_metadata_) {
-      schema_ = schema->WithMetadata(custom_metadata_);
+    if (custom_schema_) {
+      schema_ = custom_schema_;
     } else {
       schema_ = schema;
     }
@@ -434,7 +434,7 @@ class DatasetWritingSinkNodeConsumer : public acero::SinkNodeConsumer {
     });
   }
 
-  std::shared_ptr<const KeyValueMetadata> custom_metadata_;
+  std::shared_ptr<Schema> custom_schema_;
   std::unique_ptr<internal::DatasetWriter> dataset_writer_;
   FileSystemDatasetWriteOptions write_options_;
   Future<> finished_ = Future<>::Make();
@@ -453,13 +453,16 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio
 
   // The projected_schema is currently used by pyarrow to preserve the custom metadata
   // when reading from a single input file.
-  const auto& custom_metadata = scanner->options()->projected_schema->metadata();
+  const auto& custom_schema = scanner->options()->projected_schema;
+
+  WriteNodeOptions write_node_options(write_options);
+  write_node_options.custom_schema = custom_schema;
 
   acero::Declaration plan = acero::Declaration::Sequence({
       {"scan", ScanNodeOptions{dataset, scanner->options()}},
       {"filter", acero::FilterNodeOptions{scanner->options()->filter}},
       {"project", acero::ProjectNodeOptions{std::move(exprs), std::move(names)}},
-      {"write", WriteNodeOptions{write_options, custom_metadata}},
+      {"write", std::move(write_node_options)},
   });
 
   return acero::DeclarationToStatus(std::move(plan), scanner->options()->use_threads);
@@ -475,16 +478,50 @@ Result<acero::ExecNode*> MakeWriteNode(acero::ExecPlan* plan,
 
   const WriteNodeOptions write_node_options =
       checked_cast<const WriteNodeOptions&>(options);
+  std::shared_ptr<Schema> custom_schema = write_node_options.custom_schema;
   const std::shared_ptr<const KeyValueMetadata>& custom_metadata =
       write_node_options.custom_metadata;
   const FileSystemDatasetWriteOptions& write_options = write_node_options.write_options;
 
+  const std::shared_ptr<Schema>& input_schema = inputs[0]->output_schema();
+
+  if (custom_schema != nullptr) {
+    if (custom_metadata) {
+      return Status::TypeError(
+          "Do not provide both custom_metadata and custom_schema. If custom_schema is "
+          "used then custom_schema->metadata should be used instead of custom_metadata");
+    }
+
+    if (custom_schema->num_fields() != input_schema->num_fields()) {
+      return Status::TypeError(
+          "The provided custom_schema did not have the same number of fields as the "
+          "data. The custom schema can only be used to add metadata / nullability to "
+          "fields and cannot change the type or number of fields.");
+    }
+    for (int field_idx = 0; field_idx < input_schema->num_fields(); field_idx++) {
+      if (!input_schema->field(field_idx)->type()->Equals(
+              custom_schema->field(field_idx)->type())) {
+        return Status::TypeError("The provided custom_schema specified type ",
+                                 custom_schema->field(field_idx)->type()->ToString(),
+                                 " for field ", field_idx, " and the input data has type ",
+                                 input_schema->field(field_idx)->type()->ToString(),
+                                 ". The custom schema can only be used to add metadata / "
+                                 "nullability to fields and "
+                                 "cannot change the type or number of fields.");
+      }
+    }
+  }
+
+  if (custom_metadata) {
+    custom_schema = input_schema->WithMetadata(custom_metadata);
+  }
+
   if (!write_options.partitioning) {
     return Status::Invalid("Must provide partitioning");
   }
 
   std::shared_ptr<DatasetWritingSinkNodeConsumer> consumer =
-      std::make_shared<DatasetWritingSinkNodeConsumer>(custom_metadata, write_options);
+      std::make_shared<DatasetWritingSinkNodeConsumer>(custom_schema, write_options);
 
   ARROW_ASSIGN_OR_RAISE(
       auto node,
10 changes: 10 additions & 0 deletions cpp/src/arrow/dataset/file_base.h
@@ -33,6 +33,7 @@
 #include "arrow/dataset/visibility.h"
 #include "arrow/filesystem/filesystem.h"
 #include "arrow/io/file.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/compression.h"
 
 namespace arrow {
@@ -470,6 +471,15 @@ class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions {
 
   /// \brief Options to control how to write the dataset
   FileSystemDatasetWriteOptions write_options;
+  /// \brief Optional schema to attach to all written batches
+  ///
+  /// By default, we will use the output schema of the input.
+  ///
+  /// This can be used to alter schema metadata, field nullability, or field metadata.
+  /// However, this cannot be used to change the type of data. If the custom schema does
+  /// not have the same number of fields and the same data types as the input then the
+  /// plan will fail.
+  std::shared_ptr<Schema> custom_schema;
   /// \brief Optional metadata to attach to written batches
   std::shared_ptr<const KeyValueMetadata> custom_metadata;
 };
174 changes: 174 additions & 0 deletions cpp/src/arrow/dataset/write_node_test.cc
@@ -0,0 +1,174 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <memory>

#include "arrow/acero/exec_plan.h"
#include "arrow/acero/options.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/plan.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/mockfs.h"
#include "arrow/io/interfaces.h"
#include "arrow/ipc/reader.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"

#include "arrow/table.h"
#include "arrow/util/key_value_metadata.h"

namespace arrow {

namespace dataset {

class SimpleWriteNodeTest : public ::testing::Test {
 protected:
  void SetUp() override {
    internal::Initialize();
    mock_fs_ = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
    auto ipc_format = std::make_shared<dataset::IpcFileFormat>();

    fs_write_options_.filesystem = mock_fs_;
    fs_write_options_.base_dir = "/my_dataset";
    fs_write_options_.basename_template = "{i}.arrow";
    fs_write_options_.file_write_options = ipc_format->DefaultWriteOptions();
    fs_write_options_.partitioning = dataset::Partitioning::Default();
  }

  std::shared_ptr<fs::internal::MockFileSystem> mock_fs_;
  dataset::FileSystemDatasetWriteOptions fs_write_options_;
};

TEST_F(SimpleWriteNodeTest, CustomNullability) {
  // Create an input table with a nullable and a non-nullable type
  ExecBatch batch = gen::Gen({gen::Step()})->FailOnError()->ExecBatch(/*num_rows=*/1);
  std::shared_ptr<Schema> test_schema =
      schema({field("nullable_i32", uint32(), /*nullable=*/true),
              field("non_nullable_i32", uint32(), /*nullable=*/false)});
  std::shared_ptr<RecordBatch> record_batch =
      RecordBatch::Make(test_schema, /*num_rows=*/1,
                        {batch.values[0].make_array(), batch.values[0].make_array()});
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> table,
                       Table::FromRecordBatches({std::move(record_batch)}));

  ASSERT_TRUE(table->field(0)->nullable());
  ASSERT_FALSE(table->field(1)->nullable());

  dataset::WriteNodeOptions write_options(fs_write_options_);
  write_options.custom_schema = test_schema;

  // Write the data to disk (these plans use a project because it destroys whatever
  // metadata happened to be in the table source node's output schema). This more
  // accurately simulates reading from a dataset.
  acero::Declaration plan = acero::Declaration::Sequence(
      {{"table_source", acero::TableSourceNodeOptions(table)},
       {"project",
        acero::ProjectNodeOptions({compute::field_ref(0), compute::field_ref(1)})},
       {"write", write_options}});

  ASSERT_OK(DeclarationToStatus(plan));

  // Read the file back out and verify the nullability
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<io::RandomAccessFile> file,
                       mock_fs_->OpenInputFile("/my_dataset/0.arrow"));
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<ipc::RecordBatchFileReader> file_reader,
                       ipc::RecordBatchFileReader::Open(file));
  std::shared_ptr<Schema> file_schema = file_reader->schema();

  ASSERT_TRUE(file_schema->field(0)->nullable());
  ASSERT_FALSE(file_schema->field(1)->nullable());

  // Invalid custom schema

  // Incorrect # of fields
  write_options.custom_schema = schema({});
  plan = acero::Declaration::Sequence(
      {{"table_source", acero::TableSourceNodeOptions(table)},
       {"project",
        acero::ProjectNodeOptions({compute::field_ref(0), compute::field_ref(1)})},
       {"write", write_options}});

  ASSERT_THAT(
      DeclarationToStatus(plan),
      Raises(StatusCode::TypeError,
             ::testing::HasSubstr("did not have the same number of fields as the data")));

  // Incorrect types
  write_options.custom_schema =
      schema({field("nullable_i32", int32()), field("non_nullable_i32", int32())});
  plan = acero::Declaration::Sequence(
      {{"table_source", acero::TableSourceNodeOptions(table)},
       {"project",
        acero::ProjectNodeOptions({compute::field_ref(0), compute::field_ref(1)})},
       {"write", write_options}});
  ASSERT_THAT(
      DeclarationToStatus(plan),
      Raises(StatusCode::TypeError, ::testing::HasSubstr("and the input data has type")));

  // Cannot have both custom_schema and custom_metadata
  write_options.custom_schema = test_schema;
  write_options.custom_metadata = key_value_metadata({{"foo", "bar"}});
  plan = acero::Declaration::Sequence(
      {{"table_source", acero::TableSourceNodeOptions(std::move(table))},
       {"project",
        acero::ProjectNodeOptions({compute::field_ref(0), compute::field_ref(1)})},
       {"write", write_options}});
  ASSERT_THAT(DeclarationToStatus(plan),
              Raises(StatusCode::TypeError,
                     ::testing::HasSubstr(
                         "Do not provide both custom_metadata and custom_schema")));
}

TEST_F(SimpleWriteNodeTest, CustomMetadata) {
  constexpr int64_t kRowsPerChunk = 1;
  constexpr int64_t kNumChunks = 1;
  // Create an input table with no schema metadata
  std::shared_ptr<Table> table =
      gen::Gen({gen::Step()})->FailOnError()->Table(kRowsPerChunk, kNumChunks);

  std::shared_ptr<KeyValueMetadata> custom_metadata =
      key_value_metadata({{"foo", "bar"}});

  dataset::WriteNodeOptions write_options(fs_write_options_);
  write_options.custom_metadata = custom_metadata;

  // Write the data to disk
  acero::Declaration plan = acero::Declaration::Sequence(
      {{"table_source", acero::TableSourceNodeOptions(table)},
       {"project", acero::ProjectNodeOptions({compute::field_ref(0)})},
       {"write", write_options}});

  ASSERT_OK(DeclarationToStatus(plan));

  // Read the file back out and verify the schema metadata
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<io::RandomAccessFile> file,
                       mock_fs_->OpenInputFile("/my_dataset/0.arrow"));
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<ipc::RecordBatchFileReader> file_reader,
                       ipc::RecordBatchFileReader::Open(file));
  std::shared_ptr<Schema> file_schema = file_reader->schema();

  ASSERT_TRUE(custom_metadata->Equals(*file_schema->metadata()));
}

}  // namespace dataset
}  // namespace arrow
53 changes: 53 additions & 0 deletions python/pyarrow/tests/test_dataset.py
@@ -5172,6 +5172,59 @@ def test_dataset_partition_with_slash(tmpdir):
     assert encoded_paths == file_paths
 
 
+@pytest.mark.parquet
+def test_write_dataset_preserve_nullability(tempdir):
+    # GH-35730
+    schema_nullable = pa.schema([
+        pa.field("x", pa.int64(), nullable=False),
+        pa.field("y", pa.int64(), nullable=True)])
+
+    arrays = [[1, 2, 3], [None, 5, None]]
+    table = pa.Table.from_arrays(arrays, schema=schema_nullable)
+
+    pq.write_to_dataset(table, tempdir / "nulltest1")
+    dataset = ds.dataset(tempdir / "nulltest1", format="parquet")
+    # nullability of field is preserved
+    assert dataset.to_table().schema.equals(schema_nullable)
+
+    ds.write_dataset(table, tempdir / "nulltest2", format="parquet")
+    dataset = ds.dataset(tempdir / "nulltest2", format="parquet")
+    assert dataset.to_table().schema.equals(schema_nullable)
+
+    ds.write_dataset([table, table], tempdir / "nulltest3", format="parquet")
+    dataset = ds.dataset(tempdir / "nulltest3", format="parquet")
+    assert dataset.to_table().schema.equals(schema_nullable)
+
+
+def test_write_dataset_preserve_field_metadata(tempdir):
+    schema_metadata = pa.schema([
+        pa.field("x", pa.int64(), metadata={b'foo': b'bar'}),
+        pa.field("y", pa.int64())])
+
+    schema_no_meta = pa.schema([
+        pa.field("x", pa.int64()),
+        pa.field("y", pa.int64())])
+
+    arrays = [[1, 2, 3], [None, 5, None]]
+    table = pa.Table.from_arrays(arrays, schema=schema_metadata)
+    table_no_meta = pa.Table.from_arrays(arrays, schema=schema_no_meta)
+
+    # If no schema is provided the schema of the first table will be used
+    ds.write_dataset([table, table_no_meta], tempdir / "test1", format="parquet")
+    dataset = ds.dataset(tempdir / "test1", format="parquet")
+    assert dataset.to_table().schema.equals(schema_metadata, check_metadata=True)
+
+    ds.write_dataset([table_no_meta, table], tempdir / "test2", format="parquet")
+    dataset = ds.dataset(tempdir / "test2", format="parquet")
+    assert dataset.to_table().schema.equals(schema_no_meta, check_metadata=True)
+
+    # If a schema is provided it will override the schema of the input
+    ds.write_dataset([table_no_meta, table], tempdir / "test3", format="parquet",
+                     schema=schema_metadata)
+    dataset = ds.dataset(tempdir / "test3", format="parquet")
+    assert dataset.to_table().schema.equals(schema_metadata, check_metadata=True)
+
+
 @pytest.mark.parametrize('dstype', [
     "fs", "mem"
 ])
4 changes: 2 additions & 2 deletions r/R/arrowExports.R

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion r/R/query-engine.R
@@ -236,10 +236,12 @@ ExecPlan <- R6Class("ExecPlan",
     },
     Write = function(node, ...) {
      # TODO(ARROW-16200): take FileSystemDatasetWriteOptions not ...
+      final_metadata <- prepare_key_value_metadata(node$final_metadata())
+
      ExecPlan_Write(
        self,
        node,
-        prepare_key_value_metadata(node$final_metadata()),
+        node$schema$WithMetadata(final_metadata),
        ...
      )
    },