From 8f16381f5db6ca2b85d0466c9eed93d26dafa5b7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 18 Sep 2025 16:54:56 -0400 Subject: [PATCH 1/6] Support reading/writing VariantArray to parquet with Variant LogicalType --- parquet-variant-compute/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- parquet/src/arrow/schema/complex.rs | 29 +-- parquet/src/arrow/schema/extension.rs | 62 ++++++ parquet/src/arrow/schema/mod.rs | 12 +- parquet/src/variant.rs | 263 ++++++++++++++++++++++---- 6 files changed, 317 insertions(+), 53 deletions(-) create mode 100644 parquet/src/arrow/schema/extension.rs diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index feb8172a9407..c110310b4237 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -31,7 +31,7 @@ rust-version = { workspace = true } [dependencies] -arrow = { workspace = true } +arrow = { workspace = true , features = ["canonical_extension_types"]} arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } indexmap = "2.10.0" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 5dbd4b5b39dd..b34928bc1f25 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -129,7 +129,7 @@ encryption = ["dep:ring"] flate2-rust_backened = ["flate2/rust_backend"] flate2-zlib-rs = ["flate2/zlib-rs"] # Enable parquet variant support -variant_experimental = ["parquet-variant", "parquet-variant-json", "parquet-variant-compute"] +variant_experimental = ["arrow", "parquet-variant", "parquet-variant-json", "parquet-variant-compute"] [[example]] diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 16d46bd852dc..ecc80a65904a 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::sync::Arc; +use crate::arrow::schema::extension::add_extension_type; use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; use crate::basic::{ConvertedType, Repetition}; @@ -172,7 +173,7 @@ impl Visitor { let parquet_fields = struct_type.get_fields(); - // Extract the arrow fields + // Extract any arrow fields from the hints let arrow_fields = match &context.data_type { Some(DataType::Struct(fields)) => { if fields.len() != parquet_fields.len() { @@ -220,10 +221,10 @@ impl Visitor { data_type, }; - if let Some(child) = self.dispatch(parquet_field, child_ctx)? { + if let Some(mut child) = self.dispatch(parquet_field, child_ctx)? 
{ // The child type returned may be different from what is encoded in the arrow // schema in the event of a mismatch or a projection - child_fields.push(convert_field(parquet_field, &child, arrow_field)); + child_fields.push(convert_field(parquet_field, &mut child, arrow_field)); children.push(child); } } @@ -352,13 +353,13 @@ impl Visitor { // Need both columns to be projected match (maybe_key, maybe_value) { - (Some(key), Some(value)) => { + (Some(mut key), Some(mut value)) => { let key_field = Arc::new( - convert_field(map_key, &key, arrow_key) + convert_field(map_key, &mut key, arrow_key) // The key is always non-nullable (#5630) .with_nullable(false), ); - let value_field = Arc::new(convert_field(map_value, &value, arrow_value)); + let value_field = Arc::new(convert_field(map_value, &mut value, arrow_value)); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), _ => HashMap::default(), @@ -495,8 +496,8 @@ impl Visitor { }; match self.dispatch(item_type, new_context) { - Ok(Some(item)) => { - let item_field = Arc::new(convert_field(item_type, &item, arrow_field)); + Ok(Some(mut item)) => { + let item_field = Arc::new(convert_field(item_type, &mut item, arrow_field)); // Use arrow type as hint for index size let arrow_type = match context.data_type { @@ -540,11 +541,15 @@ impl Visitor { } } -/// Computes the [`Field`] for a child column +/// Computes the Arrow [`Field`] for a child column /// -/// The resulting [`Field`] will have the type dictated by `field`, a name +/// The resulting Arrow [`Field`] will have the type dictated by the Parquet `field`, a name /// dictated by the `parquet_type`, and any metadata from `arrow_hint` -fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&Field>) -> Field { +fn convert_field( + parquet_type: &Type, + field: &mut ParquetField, + arrow_hint: Option<&Field>, +) -> Field { let name = parquet_type.name(); let data_type = field.arrow_type.clone(); let nullable = field.nullable; @@ -575,7 +580,7 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<& ); ret.set_metadata(meta); } - ret + add_extension_type(ret, parquet_type) } } } diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs new file mode 100644 index 000000000000..de70923392ce --- /dev/null +++ b/parquet/src/arrow/schema/extension.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow Extension Type Support for Parquet +//! +//! This module contains mapping code to map Parquet [`LogicalType`]s to/from +//! Arrow [`ExtensionType`]s. +//! +//! Extension types are represented using the metadata from Arrow [`Field`]s +//! with the key "ARROW:extension:name". 
+
+use crate::basic::LogicalType;
+use crate::schema::types::Type;
+use arrow_schema::extension::ExtensionType;
+use arrow_schema::Field;
+
+/// Adds extension type metadata, if necessary, based on the Parquet field's
+/// [`LogicalType`]
+///
+/// Some Parquet logical types, such as Variant, do not map directly to an
+/// Arrow DataType, and instead are represented by an Arrow ExtensionType.
+/// Extension types are attached to Arrow Fields via metadata.
+pub(crate) fn add_extension_type(arrow_field: Field, parquet_type: &Type) -> Field {
+    let result = match parquet_type.get_basic_info().logical_type() {
+        #[cfg(feature = "variant_experimental")]
+        Some(LogicalType::Variant) => {
+            arrow_field.with_extension_type(parquet_variant_compute::VariantType)
+        }
+        // TODO add other LogicalTypes here
+        _ => arrow_field,
+    };
+    result
+}
+
+/// Return the Parquet logical type to use for the specified Arrow field, if any.
+#[cfg(feature = "variant_experimental")]
+pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
+    use parquet_variant_compute::VariantType;
+    if field.extension_type_name()? == VariantType::NAME {
+        return Some(LogicalType::Variant);
+    };
+    None
+}
+
+#[cfg(not(feature = "variant_experimental"))]
+pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
+    None
+}
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 5b079b66276a..9d1098d86ca6 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -35,13 +35,14 @@ use crate::file::{metadata::KeyValue, properties::WriterProperties};
 use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type};
 
 mod complex;
+mod extension;
 mod primitive;
 
+use super::PARQUET_FIELD_ID_META_KEY;
+use crate::arrow::schema::extension::logical_type_for_struct;
 use crate::arrow::ProjectionMask;
 pub(crate) use complex::{ParquetField, ParquetFieldType};
 
-use super::PARQUET_FIELD_ID_META_KEY;
-
 /// Convert Parquet schema to Arrow schema including optional metadata
 ///
 /// Attempts to decode any existing Arrow schema metadata, falling back
@@ -63,7 +64,11 @@ pub fn parquet_to_arrow_schema_by_columns(
     Ok(parquet_to_arrow_schema_and_fields(parquet_schema, mask, key_value_metadata)?.0)
 }
 
-/// Extracts the arrow metadata
+/// Determines the Arrow Schema from a Parquet schema
+///
+/// Looks for an Arrow schema metadata "hint" (see
+/// [`parquet_to_arrow_field_levels`]), and uses it if present to ensure
+/// lossless round trips.
 pub(crate) fn parquet_to_arrow_schema_and_fields(
     parquet_schema: &SchemaDescriptor,
     mask: ProjectionMask,
@@ -728,6 +733,7 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result<Type> {
                 .with_fields(fields)
                 .with_repetition(repetition)
                 .with_id(id)
+                .with_logical_type(logical_type_for_struct(field))
                 .build()
         }
         DataType::Map(field, _) => {
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
index b5902c02ed8e..276059ff49f9 100644
--- a/parquet/src/variant.rs
+++ b/parquet/src/variant.rs
@@ -25,38 +25,36 @@
 //! * [`Variant`] represents variant value, which can be an object, list, or primitive.
 //! * [`VariantBuilder`] for building `Variant` values.
 //! * [`VariantArray`] for representing a column of Variant values.
-//! * [`compute`] module with functions for manipulating Variants, such as
+//! * [`json_to_variant`] and [`variant_to_json`] for converting to/from JSON.
+//! * [`cast_to_variant()`] for casting other Arrow arrays to `VariantArray`.
+//! * [`VariantType`] Arrow ExtensionType for Parquet Variant logical type.
 //!
[`variant_get`] to extracting a value by path and functions to convert //! between `Variant` and JSON. //! -//! [Variant Logical Type]: Variant -//! [`VariantArray`]: compute::VariantArray -//! [`variant_get`]: compute::variant_get -//! //! # Example: Writing a Parquet file with Variant column //! ```rust -//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder}; -//! # use parquet::variant::VariantBuilderExt; +//! # use parquet::variant::{VariantArray, VariantType, VariantArrayBuilder, VariantBuilderExt}; //! # use std::sync::Arc; -//! # use arrow_array::{ArrayRef, RecordBatch}; +//! # use arrow_array::{Array, ArrayRef, RecordBatch}; +//! # use arrow_schema::{DataType, Field, Schema}; //! # use parquet::arrow::ArrowWriter; //! # fn main() -> Result<(), parquet::errors::ParquetError> { //! // Use the VariantArrayBuilder to build a VariantArray //! let mut builder = VariantArrayBuilder::new(3); -//! // row 1: {"name": "Alice"} -//! builder.new_object().with_field("name", "Alice").finish(); +//! builder.new_object().with_field("name", "Alice").finish(); // row 1: {"name": "Alice"} +//! builder.append_value("such wow"); // row 2: "such wow" (a string) //! let array = builder.build(); //! -//! // TODO support writing VariantArray directly -//! // at the moment it panics when trying to downcast to a struct array -//! // https://github.com/apache/arrow-rs/issues/8296 -//! // let array: ArrayRef = Arc::new(array); -//! let array: ArrayRef = Arc::new(array.into_inner()); -//! +//! // Since VariantArray is an ExtensionType, it needs to be converted +//! // to an ArrayRef and Field with the appropriate metadata +//! // before it can be written to a Parquet file +//! let field = array.field("data"); +//! let array = ArrayRef::from(array); //! // create a RecordBatch with the VariantArray -//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?; +//! let schema = Schema::new(vec![field]); +//! let batch = RecordBatch::try_new(Arc::new(schema), vec![array])?; //! -//! // write the RecordBatch to a Parquet file +//! // Now you can write the RecordBatch to the Parquet file, as normal //! let file = std::fs::File::create("variant.parquet")?; //! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; //! writer.write(&batch)?; @@ -67,37 +65,29 @@ //! # } //! ``` //! -//! # Example: Writing JSON with a Parquet file with Variant column +//! # Example: Writing JSON into a Parquet file with Variant column //! ```rust //! # use std::sync::Arc; //! # use arrow_array::{ArrayRef, RecordBatch, StringArray}; -//! # use parquet::variant::compute::json_to_variant; -//! # use parquet::variant::compute::VariantArray; +//! # use arrow_schema::Schema; +//! # use parquet::variant::{json_to_variant, VariantArray}; //! # use parquet::arrow::ArrowWriter; //! # fn main() -> Result<(), parquet::errors::ParquetError> { //! // Create an array of JSON strings, simulating a column of JSON data -//! // TODO use StringViewArray when available -//! let input_array = StringArray::from(vec![ +//! let input_array: ArrayRef = Arc::new(StringArray::from(vec![ //! Some(r#"{"name": "Alice", "age": 30}"#), //! Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#), //! None, //! Some("{}"), -//! ]); -//! let input_array: ArrayRef = Arc::new(input_array); +//! ])); //! //! // Convert the JSON strings to a VariantArray //! let array: VariantArray = json_to_variant(&input_array)?; -//! -//! // TODO support writing VariantArray directly -//! 
// at the moment it panics when trying to downcast to a struct array
-//! // https://github.com/apache/arrow-rs/issues/8296
-//! // let array: ArrayRef = Arc::new(array);
-//! let array: ArrayRef = Arc::new(array.into_inner());
-//!
 //! // create a RecordBatch with the VariantArray
-//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//! let schema = Schema::new(vec![array.field("data")]);
+//! let batch = RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)])?;
 //!
-//! // write the RecordBatch to a Parquet file
+//! // write the RecordBatch to a Parquet file as normal
 //! let file = std::fs::File::create("variant-json.parquet")?;
 //! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
 //! writer.write(&batch)?;
@@ -108,6 +98,207 @@
 //! ```
 //!
 //! # Example: Reading a Parquet file with Variant column
-//! (TODO: add example)
+//!
+//! Use the [`VariantType`] extension type to find the Variant column:
+//!
+//! ```
+//! # use std::sync::Arc;
+//! # use std::path::PathBuf;
+//! # use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
+//! # use parquet::variant::{Variant, VariantArray, VariantType};
+//! # use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! # use arrow_array::StructArray;
+//! # fn file_path() -> PathBuf { // return a testing file path
+//! #     PathBuf::from(arrow::util::test_util::parquet_test_data())
+//! #         .join("..")
+//! #         .join("shredded_variant")
+//! #         .join("case-075.parquet")
+//! # }
+//! // Read the Parquet file using the standard Arrow Parquet reader.
+//! // Note this file has 2 columns: "id", "var", and the "var" column
+//! // contains a variant that looks like this:
+//! // "Variant(metadata=VariantMetadata(dict={}), value=Variant(type=STRING, value=iceberg))"
+//! let file = std::fs::File::open(file_path())?;
+//! let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
+//!
+//! // You can check if a column contains a Variant using
+//! // the VariantType extension type
+//! let schema = reader.schema();
+//! let field = schema.field_with_name("var")?;
+//! assert!(field.try_extension_type::<VariantType>().is_ok());
+//!
+//! // The reader will yield RecordBatches with a StructArray;
+//! // to convert them to VariantArray, use VariantArray::try_new
+//! let batch = reader.next().unwrap().unwrap();
+//!
+//! let col = batch.column_by_name("var").unwrap();
+//! let var_array = VariantArray::try_new(col)?;
+//! assert_eq!(var_array.len(), 1);
+//! let var_value: Variant = var_array.value(0);
+//! assert_eq!(var_value, Variant::from("iceberg")); // the value in case-075.parquet
+//! # Ok(())
+//! # }
+//! 
``` pub use parquet_variant::*; -pub use parquet_variant_compute as compute; +pub use parquet_variant_compute::*; + +#[cfg(test)] +mod tests { + use crate::arrow::arrow_reader::ArrowReaderBuilder; + use crate::arrow::ArrowWriter; + use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; + use crate::file::reader::ChunkReader; + use arrow::util::test_util::parquet_test_data; + use arrow_array::{ArrayRef, RecordBatch}; + use arrow_schema::Schema; + use bytes::Bytes; + use parquet_variant::{Variant, VariantBuilderExt}; + use parquet_variant_compute::{VariantArray, VariantArrayBuilder, VariantType}; + use std::path::PathBuf; + use std::sync::Arc; + + #[test] + fn roundtrip_basic() { + roundtrip(variant_array()); + } + + /// Ensure a file with Variant LogicalType, written by another writer in + /// parquet-testing, can be read as a VariantArray + #[test] + fn read_logical_type() { + // Note: case-075 2 columns ("id", "var") + // The variant looks like this: + // "Variant(metadata=VariantMetadata(dict={}), value=Variant(type=STRING, value=iceberg))" + let batch = read_shredded_variant_test_case("case-075.parquet"); + + assert_variant_metadata(&batch, "var"); + let var_column = batch.column_by_name("var").expect("expected var column"); + let var_array = + VariantArray::try_new(&var_column).expect("expected var column to be a VariantArray"); + + // verify the value + assert_eq!(var_array.len(), 1); + assert!(var_array.is_valid(0)); + let var_value = var_array.value(0); + assert_eq!(var_value, Variant::from("iceberg")); + } + + /// Writes a variant to a parquet file and ensures the parquet logical type + /// annotation is correct + #[test] + fn write_logical_type() { + let array = variant_array(); + let batch = variant_array_to_batch(array); + let buffer = write_to_buffer(&batch); + + // read the parquet file's metadata and verify the logical type + let metadata = read_metadata(&Bytes::from(buffer)); + let schema = metadata.file_metadata().schema_descr(); + let fields = schema.root_schema().get_fields(); + assert_eq!(fields.len(), 1); + let field = &fields[0]; + assert_eq!(field.name(), "data"); + // data should have been written with the Variant logical type + assert_eq!( + field.get_basic_info().logical_type(), + Some(crate::basic::LogicalType::Variant) + ); + } + + /// Return a VariantArray with 3 rows: + /// + /// 1. `{"name": "Alice"}` + /// 2. `"such wow"` (a string) + /// 3. 
`null`
+    fn variant_array() -> VariantArray {
+        let mut builder = VariantArrayBuilder::new(3);
+        // row 1: {"name": "Alice"}
+        builder.new_object().with_field("name", "Alice").finish();
+        // row 2: "such wow" (a string)
+        builder.append_value("such wow");
+        // row 3: null
+        builder.append_null();
+        builder.build()
+    }
+
+    /// Writes a VariantArray to a parquet file and reads it back, verifying that
+    /// the data is the same
+    fn roundtrip(array: VariantArray) {
+        let source_batch = variant_array_to_batch(array);
+        assert_variant_metadata(&source_batch, "data");
+
+        let buffer = write_to_buffer(&source_batch);
+        let result_batch = read_to_batch(Bytes::from(buffer));
+        assert_variant_metadata(&result_batch, "data");
+        assert_eq!(result_batch, source_batch); // NB this also checks the schemas
+    }
+
+    /// Creates a RecordBatch with a single column "data" from a VariantArray
+    fn variant_array_to_batch(array: VariantArray) -> RecordBatch {
+        let field = array.field("data");
+        let schema = Schema::new(vec![field]);
+        RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)]).unwrap()
+    }
+
+    /// Writes a RecordBatch to a memory buffer and returns the buffer
+    fn write_to_buffer(batch: &RecordBatch) -> Vec<u8> {
+        let mut buffer = vec![];
+        let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+        buffer
+    }
+
+    /// Reads the Parquet metadata
+    fn read_metadata<T: ChunkReader>(input: &T) -> ParquetMetaData {
+        let mut reader = ParquetMetaDataReader::new();
+        reader.try_parse(input).unwrap();
+        reader.finish().unwrap()
+    }
+
+    /// Reads a RecordBatch from a reader (e.g. Vec<u8> or File)
+    fn read_to_batch<T: ChunkReader + 'static>(reader: T) -> RecordBatch {
+        let reader = ArrowReaderBuilder::try_new(reader)
+            .unwrap()
+            .build()
+            .unwrap();
+        let mut batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
+        assert_eq!(batches.len(), 1);
+        batches.swap_remove(0)
+    }
+
+    /// Verifies the variant metadata is present in the schema for the specified
+    /// field name.
+    fn assert_variant_metadata(batch: &RecordBatch, field_name: &str) {
+        let schema = batch.schema();
+        let field = schema
+            .field_with_name(field_name)
+            .expect("could not find expected field");
+
+        // explicitly check the metadata so it is clear in the tests what the
+        // names are
+        let metadata_value = field
+            .metadata()
+            .get("ARROW:extension:name")
+            .expect("metadata does not exist");
+
+        assert_eq!(metadata_value, "arrow.parquet.variant");
+
+        // verify that `VariantType` also correctly finds the metadata
+        field
+            .try_extension_type::<VariantType>()
+            .expect("VariantExtensionType should be readable");
+    }
+
+    /// Read the specified test case filename from parquet-testing
+    /// See parquet-testing/shredded_variant/cases.json for more details
+    fn read_shredded_variant_test_case(name: &str) -> RecordBatch {
+        let case_file = PathBuf::from(parquet_test_data())
+            .join("..") // go up from data/ to parquet-testing/
+            .join("shredded_variant")
+            .join(name);
+        let case_file = std::fs::File::open(case_file).unwrap();
+        read_to_batch(case_file)
+    }
+}

From 68ffd32764ddaf84d27436df92a8f9a603e08443 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 24 Sep 2025 15:25:22 -0400
Subject: [PATCH 2/6] use try_extension_type after faster name check

---
 parquet/src/arrow/schema/extension.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index de70923392ce..191015b2c230 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -50,10 +50,16 @@ pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
     use parquet_variant_compute::VariantType;
-    if field.extension_type_name()? == VariantType::NAME {
-        return Some(LogicalType::Variant);
-    };
-    None
+    // Check the name first (quick and cheap) and only call try_extension_type
+    // if the name matches, to avoid unnecessary String allocations in ArrowError
+    if field.extension_type_name()? != VariantType::NAME {
+        return None;
+    }
+    match field.try_extension_type::<VariantType>() {
+        Ok(VariantType) => Some(LogicalType::Variant),
+        // Given the check above, this should not error, but if it does, ignore it
+        Err(_e) => None,
+    }
 }
 
 #[cfg(not(feature = "variant_experimental"))]

From ddd5fb886f038538bd24140a24d5a5ad0de07595 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 26 Sep 2025 11:21:57 -0400
Subject: [PATCH 3/6] Update parquet/src/arrow/schema/extension.rs

Co-authored-by: Matthijs Brobbel
---
 parquet/src/arrow/schema/extension.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index 191015b2c230..54cf37ee854d 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -35,15 +35,14 @@ use arrow_schema::Field;
 /// Arrow DataType, and instead are represented by an Arrow ExtensionType.
 /// Extension types are attached to Arrow Fields via metadata.
pub(crate) fn add_extension_type(arrow_field: Field, parquet_type: &Type) -> Field { - let result = match parquet_type.get_basic_info().logical_type() { + match parquet_type.get_basic_info().logical_type() { #[cfg(feature = "variant_experimental")] Some(LogicalType::Variant) => { arrow_field.with_extension_type(parquet_variant_compute::VariantType) } // TODO add other LogicalTypes here _ => arrow_field, - }; - result + } } /// Return the Parquet logical type to use for the specified Arrow field, if any. From a75e4cab8b59c958e5ecbd13463904d468a70baf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Sep 2025 11:27:58 -0400 Subject: [PATCH 4/6] fix whitespace in examples --- parquet/src/variant.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs index 276059ff49f9..497d1dc6c4f3 100644 --- a/parquet/src/variant.rs +++ b/parquet/src/variant.rs @@ -73,18 +73,18 @@ //! # use parquet::variant::{json_to_variant, VariantArray}; //! # use parquet::arrow::ArrowWriter; //! # fn main() -> Result<(), parquet::errors::ParquetError> { -//! // Create an array of JSON strings, simulating a column of JSON data -//! let input_array: ArrayRef = Arc::new(StringArray::from(vec![ +//! // Create an array of JSON strings, simulating a column of JSON data +//! let input_array: ArrayRef = Arc::new(StringArray::from(vec![ //! Some(r#"{"name": "Alice", "age": 30}"#), //! Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#), //! None, //! Some("{}"), -//! ])); +//! ])); //! -//! // Convert the JSON strings to a VariantArray -//! let array: VariantArray = json_to_variant(&input_array)?; +//! // Convert the JSON strings to a VariantArray +//! let array: VariantArray = json_to_variant(&input_array)?; //! // create a RecordBatch with the VariantArray -//! let schema = Schema::new(vec![array.field("data")]); +//! let schema = Schema::new(vec![array.field("data")]); //! let batch = RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)])?; //! //! // write the RecordBatch to a Parquet file as normal From e6cacd824e4c2441ebb5176000afe5da030b86d9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Sep 2025 11:33:53 -0400 Subject: [PATCH 5/6] Do not panic on extension type mismatch --- parquet/src/arrow/schema/extension.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs index 54cf37ee854d..d7296cd2b217 100644 --- a/parquet/src/arrow/schema/extension.rs +++ b/parquet/src/arrow/schema/extension.rs @@ -34,11 +34,14 @@ use arrow_schema::Field; /// Some Parquet logical types, such as Variant, do not map directly to an /// Arrow DataType, and instead are represented by an Arrow ExtensionType. /// Extension types are attached to Arrow Fields via metadata. -pub(crate) fn add_extension_type(arrow_field: Field, parquet_type: &Type) -> Field { +pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field { match parquet_type.get_basic_info().logical_type() { #[cfg(feature = "variant_experimental")] Some(LogicalType::Variant) => { - arrow_field.with_extension_type(parquet_variant_compute::VariantType) + // try to add the Variant extension type, but if that fails (e.g. 
because the
+            // storage type is not supported), just return the field as is
+            arrow_field.try_with_extension_type(parquet_variant_compute::VariantType).ok();
+            arrow_field
         }
         // TODO add other LogicalTypes here
         _ => arrow_field,

From 77f0d93ec84dead467db47b6f06578b2021cfd92 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 26 Sep 2025 12:10:05 -0400
Subject: [PATCH 6/6] fmt

---
 parquet/src/arrow/schema/extension.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index d7296cd2b217..752b9a5ced87 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -40,7 +40,9 @@ pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field {
         Some(LogicalType::Variant) => {
             // try to add the Variant extension type, but if that fails (e.g. because the
             // storage type is not supported), just return the field as is
-            arrow_field.try_with_extension_type(parquet_variant_compute::VariantType).ok();
+            arrow_field
+                .try_with_extension_type(parquet_variant_compute::VariantType)
+                .ok();
             arrow_field
         }
         // TODO add other LogicalTypes here
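
For reference, the extension-type handshake these patches introduce can be
exercised directly from user code. The following is a minimal sketch, assuming
the `variant_experimental` feature, the unshredded metadata/value BinaryView
storage layout that `VariantArray` uses, and the `arrow.parquet.variant`
extension name checked in the tests above; `is_variant_field` is a hypothetical
helper for illustration, not an API added by this series:

```rust
use arrow_schema::extension::ExtensionType; // trait in scope for VariantType::NAME
use arrow_schema::{DataType, Field};
use parquet_variant_compute::VariantType;

/// Hypothetical helper mirroring `logical_type_for_struct` (patch 2/6): the
/// name comparison is cheap, while `try_extension_type` fully validates the
/// storage type (and allocates an ArrowError when it does not match).
fn is_variant_field(field: &Field) -> bool {
    field.extension_type_name() == Some(VariantType::NAME)
        && field.try_extension_type::<VariantType>().is_ok()
}

fn main() {
    // Assumed storage: the unshredded metadata/value pair used by VariantArray
    let storage = DataType::Struct(
        vec![
            Field::new("metadata", DataType::BinaryView, false),
            Field::new("value", DataType::BinaryView, true),
        ]
        .into(),
    );

    // Attaching the extension type sets the "ARROW:extension:name" field
    // metadata, which is what add_extension_type does on the read path
    let field = Field::new("var", storage, true).with_extension_type(VariantType);
    assert_eq!(
        field.metadata().get("ARROW:extension:name").map(String::as_str),
        Some(VariantType::NAME),
    );
    assert!(is_variant_field(&field));
}
```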