From 99cedf630537453aa363ca328fc2fb60029d5e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=ADa=20Adriana?=
Date: Mon, 29 Sep 2025 17:07:35 +0200
Subject: [PATCH 01/13] Refactor HashJoinExec to progressively accumulate
 dynamic filter bounds instead of computing them after data is accumulated
 (#17444) (#46)

(cherry picked from commit 5b833b9a63018fd91baa9bb9031b4ed19dbfd721)

Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
---
 Cargo.lock                                    |   1 -
 datafusion/physical-plan/Cargo.toml           |   2 +-
 .../physical-plan/src/joins/hash_join/exec.rs | 193 +++++++++++++++---
 3 files changed, 160 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 25bcaf68cb84..5186c6d6d598 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2559,7 +2559,6 @@ dependencies = [
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-functions-aggregate",
- "datafusion-functions-aggregate-common",
  "datafusion-functions-window",
  "datafusion-functions-window-common",
  "datafusion-physical-expr",
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index 830a159d5eb1..01e4b1899a86 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -53,7 +53,7 @@ datafusion-common = { workspace = true, default-features = true }
 datafusion-common-runtime = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-functions-aggregate-common = { workspace = true }
+datafusion-functions-aggregate = { workspace = true }
 datafusion-functions-window-common = { workspace = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
 datafusion-physical-expr-common = { workspace = true }
diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs
index b184bd341306..2c23a9c8c5d1 100644
--- a/datafusion/physical-plan/src/joins/hash_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs
@@ -54,20 +54,21 @@ use crate::{
     PlanProperties, SendableRecordBatchStream, Statistics,
 };
 
-use arrow::array::{Array, ArrayRef, BooleanBufferBuilder};
+use arrow::array::{ArrayRef, BooleanBufferBuilder};
 use arrow::compute::concat_batches;
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use arrow::util::bit_util;
+use arrow_schema::DataType;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::utils::memory::estimate_memory_size;
 use datafusion_common::{
     internal_err, plan_err, project_schema, JoinSide, JoinType, NullEquality, Result,
-    ScalarValue,
 };
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_execution::TaskContext;
-use datafusion_functions_aggregate_common::min_max::{max_batch, min_batch};
+use datafusion_expr::Accumulator;
+use datafusion_functions_aggregate::min_max::{MaxAccumulator, MinAccumulator};
 use datafusion_physical_expr::equivalence::{
     join_equivalence_properties, ProjectionMapping,
 };
@@ -1188,29 +1189,123 @@ impl ExecutionPlan for HashJoinExec {
     }
 }
 
-/// Compute min/max bounds for each column in the given arrays
-fn compute_bounds(arrays: &[ArrayRef]) -> Result<Vec<ColumnBounds>> {
-    arrays
-        .iter()
-        .map(|array| {
-            if array.is_empty() {
-                // Return NULL values for empty arrays
-                return Ok(ColumnBounds::new(
-                    ScalarValue::try_from(array.data_type())?,
-                    ScalarValue::try_from(array.data_type())?,
-                ));
+/// Accumulator for collecting min/max bounds from build-side data during hash join.
+///
+/// This struct encapsulates the logic for progressively computing column bounds
+/// (minimum and maximum values) for a specific join key expression as batches
+/// are processed during the build phase of a hash join.
+///
+/// The bounds are used for dynamic filter pushdown optimization, where filters
+/// based on the actual data ranges can be pushed down to the probe side to
+/// eliminate unnecessary data early.
+struct CollectLeftAccumulator {
+    /// The physical expression to evaluate for each batch
+    expr: Arc<dyn PhysicalExpr>,
+    /// Accumulator for tracking the minimum value across all batches
+    min: MinAccumulator,
+    /// Accumulator for tracking the maximum value across all batches
+    max: MaxAccumulator,
+}
+
+impl CollectLeftAccumulator {
+    /// Creates a new accumulator for tracking bounds of a join key expression.
+    ///
+    /// # Arguments
+    /// * `expr` - The physical expression to track bounds for
+    /// * `schema` - The schema of the input data
+    ///
+    /// # Returns
+    /// A new `CollectLeftAccumulator` instance configured for the expression's data type
+    fn try_new(expr: Arc<dyn PhysicalExpr>, schema: &SchemaRef) -> Result<Self> {
+        /// Recursively unwraps dictionary types to get the underlying value type.
+        fn dictionary_value_type(data_type: &DataType) -> DataType {
+            match data_type {
+                DataType::Dictionary(_, value_type) => {
+                    dictionary_value_type(value_type.as_ref())
+                }
+                _ => data_type.clone(),
             }
+        }
+
+        let data_type = expr
+            .data_type(schema)
+            // Min/Max can operate on dictionary data but expect to be initialized with the underlying value type
+            .map(|dt| dictionary_value_type(&dt))?;
+        Ok(Self {
+            expr,
+            min: MinAccumulator::try_new(&data_type)?,
+            max: MaxAccumulator::try_new(&data_type)?,
+        })
+    }
 
-            // Use Arrow kernels for efficient min/max computation
-            let min_val = min_batch(array)?;
-            let max_val = max_batch(array)?;
+    /// Updates the accumulators with values from a new batch.
+    ///
+    /// Evaluates the expression on the batch and updates both min and max
+    /// accumulators with the resulting values.
+    ///
+    /// # Arguments
+    /// * `batch` - The record batch to process
+    ///
+    /// # Returns
+    /// Ok(()) if the update succeeds, or an error if expression evaluation fails
+    fn update_batch(&mut self, batch: &RecordBatch) -> Result<()> {
+        let array = self.expr.evaluate(batch)?.into_array(batch.num_rows())?;
+        self.min.update_batch(std::slice::from_ref(&array))?;
+        self.max.update_batch(std::slice::from_ref(&array))?;
+        Ok(())
+    }
 
-            Ok(ColumnBounds::new(min_val, max_val))
+    /// Finalizes the accumulation and returns the computed bounds.
+    ///
+    /// Consumes self to extract the final min and max values from the accumulators.
+    ///
+    /// # Returns
+    /// The `ColumnBounds` containing the minimum and maximum values observed
+    fn evaluate(mut self) -> Result<ColumnBounds> {
+        Ok(ColumnBounds::new(
+            self.min.evaluate()?,
+            self.max.evaluate()?,
+        ))
+    }
+}
+
+/// State for collecting the build-side data during hash join
+struct BuildSideState {
+    batches: Vec<RecordBatch>,
+    num_rows: usize,
+    metrics: BuildProbeJoinMetrics,
+    reservation: MemoryReservation,
+    bounds_accumulators: Option<Vec<CollectLeftAccumulator>>,
+}
+
+impl BuildSideState {
+    /// Create a new BuildSideState with optional accumulators for bounds computation
+    fn try_new(
+        metrics: BuildProbeJoinMetrics,
+        reservation: MemoryReservation,
+        on_left: Vec<Arc<dyn PhysicalExpr>>,
+        schema: &SchemaRef,
+        should_compute_bounds: bool,
+    ) -> Result<Self> {
+        Ok(Self {
+            batches: Vec::new(),
+            num_rows: 0,
+            metrics,
+            reservation,
+            bounds_accumulators: should_compute_bounds
+                .then(|| {
+                    on_left
+                        .iter()
+                        .map(|expr| {
+                            CollectLeftAccumulator::try_new(Arc::clone(expr), schema)
+                        })
+                        .collect::<Result<Vec<_>>>()
+                })
+                .transpose()?,
         })
-        .collect()
+    }
 }
 
-#[expect(clippy::too_many_arguments)]
 /// Collects all batches from the left (build) side stream and creates a hash map for joining.
 ///
 /// This function is responsible for:
@@ -1239,6 +1334,7 @@ fn compute_bounds(arrays: &[ArrayRef]) -> Result<Vec<ColumnBounds>> {
 /// # Returns
 /// `JoinLeftData` containing the hash map, consolidated batch, join key values,
 /// visited indices bitmap, and computed bounds (if requested).
+#[allow(clippy::too_many_arguments)]
 async fn collect_left_input(
     random_state: RandomState,
     left_stream: SendableRecordBatchStream,
@@ -1254,24 +1350,48 @@ async fn collect_left_input(
     // This operation performs 2 steps at once:
     // 1. creates a [JoinHashMap] of all batches from the stream
     // 2. stores the batches in a vector.
-    let initial = (Vec::new(), 0, metrics, reservation);
-    let (batches, num_rows, metrics, mut reservation) = left_stream
-        .try_fold(initial, |mut acc, batch| async {
+    let initial = BuildSideState::try_new(
+        metrics,
+        reservation,
+        on_left.clone(),
+        &schema,
+        should_compute_bounds,
+    )?;
+
+    let state = left_stream
+        .try_fold(initial, |mut state, batch| async move {
+            // Update accumulators if computing bounds
+            if let Some(ref mut accumulators) = state.bounds_accumulators {
+                for accumulator in accumulators {
+                    accumulator.update_batch(&batch)?;
+                }
+            }
+
+            // Decide if we spill or not
             let batch_size = get_record_batch_memory_size(&batch);
             // Reserve memory for incoming batch
-            acc.3.try_grow(batch_size)?;
+            state.reservation.try_grow(batch_size)?;
             // Update metrics
-            acc.2.build_mem_used.add(batch_size);
-            acc.2.build_input_batches.add(1);
-            acc.2.build_input_rows.add(batch.num_rows());
+            state.metrics.build_mem_used.add(batch_size);
+            state.metrics.build_input_batches.add(1);
+            state.metrics.build_input_rows.add(batch.num_rows());
             // Update row count
-            acc.1 += batch.num_rows();
+            state.num_rows += batch.num_rows();
             // Push batch to output
-            acc.0.push(batch);
-            Ok(acc)
+            state.batches.push(batch);
+            Ok(state)
         })
         .await?;
 
+    // Extract fields from state
+    let BuildSideState {
+        batches,
+        num_rows,
+        metrics,
+        mut reservation,
+        bounds_accumulators,
+    } = state;
+
     // Estimation of memory size, required for hashtable, prior to allocation.
     // Final result can be verified using `RawTable.allocation_info()`
     let fixed_size_u32 = size_of::<u32>();
@@ -1338,10 +1458,15 @@ async fn collect_left_input(
         .collect::<Result<Vec<_>>>()?;
 
     // Compute bounds for dynamic filter if enabled
-    let bounds = if should_compute_bounds && num_rows > 0 {
-        Some(compute_bounds(&left_values)?)
-    } else {
-        None
+    let bounds = match bounds_accumulators {
+        Some(accumulators) if num_rows > 0 => {
+            let bounds = accumulators
+                .into_iter()
+                .map(CollectLeftAccumulator::evaluate)
+                .collect::<Result<Vec<_>>>()?;
+            Some(bounds)
+        }
+        _ => None,
     };
 
     let data = JoinLeftData::new(

From e18a1a4525efe2e87d23cca2720809c28ac95373 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=ADa=20Adriana?=
Date: Tue, 30 Sep 2025 08:59:47 +0200
Subject: [PATCH 02/13] Downgrade sha2 from 0.10.9 to 0.10.8 (#29) (#48)

* Downgrade sha2 from 0.10.9 to 0.10.8 (#29)

(cherry picked from commit ca48c4d85ded7592289419db9686888672f2eabb)

* retrigger checks

---------

Co-authored-by: Gabriel <45515538+gabotechs@users.noreply.github.com>
---
 datafusion/functions/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index b557fe832e28..0ebee70783d8 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -80,7 +80,7 @@ log = { workspace = true }
 md-5 = { version = "^0.10.0", optional = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
-sha2 = { version = "^0.10.9", optional = true }
+sha2 = { version = "^0.10.8", optional = true }
 unicode-segmentation = { version = "^1.7.1", optional = true }
 uuid = { version = "1.18", features = ["v4"], optional = true }

From 0be6cd99bc67dca023a31e016487e5319f0ca431 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=ADa=20Adriana?=
Date: Tue, 30 Sep 2025 09:37:28 +0200
Subject: [PATCH 03/13] Make datafusion-datasource not depend on default
 features (#49)

(cherry picked from commit 368d6f4c60bbf2a308b0caabb9b2eba1655d7674)

Co-authored-by: Gabriel Musat Mestre
---
 datafusion/proto/Cargo.toml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index c95f392a051a..c005968c79ee 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -46,7 +46,17 @@ avro = ["datafusion/avro", "datafusion-common/avro"]
 [dependencies]
 arrow = { workspace = true }
 chrono = { workspace = true }
-datafusion = { workspace = true, default-features = true }
+datafusion = { workspace = true, default-features = false, features = [
+    "array_expressions",
+    "nested_expressions",
+    "datetime_expressions",
+    "encoding_expressions",
+    "regex_expressions",
+    "string_expressions",
+    "unicode_expressions",
+    "crypto_expressions",
+    "parquet",
+] }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-expr = { workspace = true }
 datafusion-proto-common = { workspace = true }

From 5506e698c41810525331e57766f73e865d86e2a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=ADa=20Adriana?=
Date: Tue, 30 Sep 2025 16:15:39 +0200
Subject: [PATCH 04/13] Revert arrow upgrade and related changes (#50)

* Revert "Upgrade arrow/parquet to 56.0.0 (#16690)"

This reverts commit fa1f8c192dd531d3f2fca61885eafa3e9002f0dd.

* Revert "refactor: use upstream inline_key_fast (#17044)"

This reverts commit 71b92bc9e768139f3bfab14faf76654a89021537.

* Revert "fix: respect inexact flags in row group metadata (#16412)"

This reverts commit afc90f727d45e6140e6a3a9ad6d159fc7a19d8dd.

* Revert "Test grouping by FixedSizeList (#17415)"

This reverts commit 03f39e51470544e9f4caca661987a19143ccd21e.
* Spelling (got reverted) * Also allow Byt from tests * Adjust sqllogictests --- Cargo.lock | 166 ++-- Cargo.toml | 16 +- datafusion-examples/Cargo.toml | 2 +- datafusion-testing | 2 +- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 16 +- .../common/src/file_options/parquet_writer.rs | 25 +- datafusion/common/src/scalar/mod.rs | 19 +- datafusion/common/src/types/native.rs | 5 +- .../src/datasource/file_format/parquet.rs | 27 +- datafusion/core/tests/fuzz_cases/pruning.rs | 11 +- datafusion/core/tests/parquet/mod.rs | 4 +- .../core/tests/parquet/row_group_pruning.rs | 14 +- .../src/avro_to_arrow/schema.rs | 2 - .../datasource-parquet/src/file_format.rs | 2 +- datafusion/expr/src/utils.rs | 2 - datafusion/physical-plan/src/sorts/cursor.rs | 214 ++++- .../proto/datafusion_common.proto | 40 +- datafusion/proto-common/src/from_proto/mod.rs | 31 +- .../proto-common/src/generated/pbjson.rs | 772 +++--------------- .../proto-common/src/generated/prost.rs | 66 +- datafusion/proto-common/src/to_proto/mod.rs | 18 +- .../src/generated/datafusion_proto_common.rs | 66 +- .../proto/src/logical_plan/file_formats.rs | 14 + datafusion/sql/src/unparser/expr.rs | 6 - datafusion/sqllogictest/test_files/array.slt | 14 +- datafusion/sqllogictest/test_files/copy.slt | 1 + .../test_files/information_schema.slt | 6 +- .../test_files/listing_table_statistics.slt | 2 +- .../test_files/parquet_statistics.slt | 16 +- .../test_files/repartition_scan.slt | 8 +- .../src/logical_plan/consumer/utils.rs | 2 - docs/source/library-user-guide/upgrading.md | 6 - docs/source/user-guide/configs.md | 3 +- parquet-testing | 2 +- testing | 2 +- typos.toml | 1 + 37 files changed, 635 insertions(+), 970 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5186c6d6d598..e58f72ee7b2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,9 +240,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" +checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" dependencies = [ "arrow-arith", "arrow-array", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" dependencies = [ "bytes", "half", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "56.0.0" +version = "55.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" dependencies = [ "arrow-array", "arrow-cast", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,9 +354,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" +checksum = "5cb3e1d2b441e6d1d5988e3f7c4523c9466b18ef77d7c525d92d36d4cad49fbe" dependencies = [ "arrow-arith", "arrow-array", @@ -381,9 +381,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" dependencies = [ "arrow-array", "arrow-buffer", @@ -396,9 +396,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" dependencies = [ "arrow-array", "arrow-buffer", @@ -418,9 +418,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -431,9 +431,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" +checksum = "0e55ecf16b9b61d433f6e63c72fc6afcf2597d7db96583de88ebb887d1822268" dependencies = [ "arrow-array", "arrow-data", @@ -443,9 +443,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" dependencies = [ "arrow-array", "arrow-buffer", @@ -456,9 +456,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" 
+checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" dependencies = [ "bitflags 2.9.1", "serde", @@ -467,9 +467,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -481,9 +481,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" dependencies = [ "arrow-array", "arrow-buffer", @@ -561,6 +561,28 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -822,7 +844,7 @@ dependencies = [ "rustls-pki-types", "tokio", "tokio-rustls", - "tower", + "tower 0.5.2", "tracing", ] @@ -943,10 +965,11 @@ dependencies = [ [[package]] name = "axum" -version = "0.8.4" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" dependencies = [ + "async-trait", "axum-core", "bytes", "futures-util", @@ -962,19 +985,20 @@ dependencies = [ "rustversion", "serde", "sync_wrapper", - "tower", + "tower 0.5.2", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.5.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" dependencies = [ + "async-trait", "bytes", - "futures-core", + "futures-util", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -4166,9 +4190,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.8.4" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" [[package]] name = "md-5" @@ -4529,9 +4553,9 @@ dependencies = [ [[package]] name = "parquet" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -5028,10 +5052,11 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" dependencies = [ + "cfg-if", "indoc", "libc", "memoffset", @@ -5045,9 +5070,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" dependencies = [ "once_cell", "target-lexicon", @@ -5055,9 +5080,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" dependencies = [ "libc", "pyo3-build-config", @@ -5065,9 +5090,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5077,9 +5102,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -5449,7 +5474,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower", + "tower 0.5.2", "tower-http", "tower-service", "url", @@ -6654,10 +6679,11 @@ dependencies = [ [[package]] name = "tonic" -version = "0.13.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ + "async-stream", "async-trait", "axum", "base64 0.22.1", @@ -6675,7 +6701,7 @@ dependencies = [ "socket2 0.5.10", "tokio", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -6683,16 +6709,17 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.2" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - "indexmap 2.11.0", + "indexmap 1.9.3", + "pin-project", "pin-project-lite", + "rand 0.8.5", "slab", - "sync_wrapper", "tokio", "tokio-util", "tower-layer", @@ -6700,6 +6727,21 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-http" version = "0.6.6" @@ -6713,7 +6755,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower", + "tower 0.5.2", 
"tower-layer", "tower-service", ] diff --git a/Cargo.toml b/Cargo.toml index 53c35ed35f0d..6f7c61268e0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.0.0", features = [ +arrow = { version = "55.2.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.0.0", default-features = false } -arrow-flight = { version = "56.0.0", features = [ +arrow-buffer = { version = "55.2.0", default-features = false } +arrow-flight = { version = "55.2.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "56.0.0", default-features = false, features = [ +arrow-ipc = { version = "55.2.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.0.0", default-features = false } -arrow-schema = { version = "56.0.0", default-features = false } -async-trait = "0.1.89" +arrow-ord = { version = "55.2.0", default-features = false } +arrow-schema = { version = "55.2.0", default-features = false } +async-trait = "0.1.88" bigdecimal = "0.4.8" bytes = "1.10" chrono = { version = "0.4.41", default-features = false } @@ -157,7 +157,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" -parquet = { version = "56.0.0", default-features = false, features = [ +parquet = { version = "55.2.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 68bb5376a1ac..204fc8794fbb 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -81,7 +81,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.13.1" +tonic = "0.12.1" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-testing b/datafusion-testing index f72ac4075ada..905df5f65cc9 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit f72ac4075ada5ea9810551bc0c3e3161c61204a2 +Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index aea5e51befb0..6350c29b46de 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.25", optional = true } +pyo3 = { version = "0.24.2", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index cdd8e72a06cc..876d419b971f 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -602,6 +602,13 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option, transform = str::to_lowercase, default = Some("page".into()) + /// (writing) Sets max statistics size for any column. 
If NULL, uses
+    /// default parquet writer setting
+    /// max_statistics_size is deprecated, currently it is not being used
+    // TODO: remove once deprecated
+    #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
+    pub max_statistics_size: Option<usize>, default = Some(4096)
+
     /// (writing) Target maximum number of rows in each row group (defaults to 1M
     /// rows). Writing larger row groups requires more memory to write, but
     /// can get better compression and be faster to read.
@@ -615,7 +622,7 @@ config_namespace! {
 
     /// (writing) Sets statistics truncate length. If NULL, uses
     /// default parquet writer setting
-    pub statistics_truncate_length: Option<usize>, default = Some(64)
+    pub statistics_truncate_length: Option<usize>, default = None
 
     /// (writing) Sets best effort maximum number of rows in data page
     pub data_page_row_count_limit: usize, default = 20_000
@@ -2134,6 +2141,13 @@ config_namespace_with_hashmap! {
     /// Sets bloom filter number of distinct values. If NULL, uses
     /// default parquet options
     pub bloom_filter_ndv: Option<u64>, default = None
+
+    /// Sets max statistics size for the column path. If NULL, uses
+    /// default parquet options
+    /// max_statistics_size is deprecated, currently it is not being used
+    // TODO: remove once deprecated
+    #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
+    pub max_statistics_size: Option<usize>, default = None
   }
 }
 
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index 185826aef47d..a59cd3cb7554 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -35,7 +35,7 @@ use parquet::{
         metadata::KeyValue,
         properties::{
             EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
-            DEFAULT_STATISTICS_ENABLED,
+            DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
         },
     },
     schema::types::ColumnPath,
@@ -160,6 +160,16 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                 builder =
                     builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }
+
+            // max_statistics_size is deprecated, currently it is not being used
+            // TODO: remove once deprecated
+            #[allow(deprecated)]
+            if let Some(max_statistics_size) = options.max_statistics_size {
+                builder = {
+                    #[allow(deprecated)]
+                    builder.set_column_max_statistics_size(path, max_statistics_size)
+                }
+            }
         }
 
         Ok(builder)
@@ -208,6 +218,7 @@ impl ParquetOptions {
             dictionary_enabled,
             dictionary_page_size_limit,
             statistics_enabled,
+            max_statistics_size,
             max_row_group_size,
             created_by,
             column_index_truncate_length,
@@ -253,6 +264,13 @@ impl ParquetOptions {
             .set_data_page_row_count_limit(*data_page_row_count_limit)
             .set_bloom_filter_enabled(*bloom_filter_on_write);
 
+        builder = {
+            #[allow(deprecated)]
+            builder.set_max_statistics_size(
+                max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
+            )
+        };
+
         if let Some(bloom_filter_fpp) = bloom_filter_fpp {
             builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
         };
@@ -445,10 +463,12 @@ mod tests {
     fn column_options_with_non_defaults(
         src_col_defaults: &ParquetOptions,
     ) -> ParquetColumnOptions {
+        #[allow(deprecated)] // max_statistics_size
        ParquetColumnOptions {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
             statistics_enabled: Some("none".into()),
+            max_statistics_size: Some(72),
            encoding: Some("RLE".into()),
             bloom_filter_enabled: Some(true),
             bloom_filter_fpp: Some(0.72),
@@ -473,6 +493,7 @@ mod tests {
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
             dictionary_page_size_limit: 42,
             statistics_enabled: Some("chunk".into()),
+            max_statistics_size: Some(42),
             max_row_group_size: 42,
             created_by: "wordy".into(),
             column_index_truncate_length: Some(42),
@@ -530,6 +551,7 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
+            max_statistics_size: Some(props.max_statistics_size(&col)),
         }
     }
 
@@ -586,6 +608,7 @@ mod tests {
             compression: default_col_props.compression,
             dictionary_enabled: default_col_props.dictionary_enabled,
             statistics_enabled: default_col_props.statistics_enabled,
+            max_statistics_size: default_col_props.max_statistics_size,
             bloom_filter_on_write: default_col_props
                 .bloom_filter_enabled
                 .unwrap_or_default(),
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index 4d88f5a66732..452d6c13b3e0 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -904,10 +904,11 @@ pub fn dict_from_values<K: ArrowDictionaryKeyType>(
         .map(|index| {
             if values_array.is_valid(index) {
                 let native_index = K::Native::from_usize(index).ok_or_else(|| {
-                    _internal_datafusion_err!(
-                        "Can not create index of type {} from value {index}",
-                        K::DATA_TYPE
-                    )
+                    DataFusionError::Internal(format!(
+                        "Can not create index of type {} from value {}",
+                        K::DATA_TYPE,
+                        index
+                    ))
                 })?;
                 Ok(Some(native_index))
             } else {
@@ -2202,16 +2203,6 @@ impl ScalarValue {
         }
 
         let array: ArrayRef = match &data_type {
-            DataType::Decimal32(_precision, _scale) => {
-                return _not_impl_err!(
-                    "Decimal32 not supported in ScalarValue::iter_to_array"
-                );
-            }
-            DataType::Decimal64(_precision, _scale) => {
-                return _not_impl_err!(
-                    "Decimal64 not supported in ScalarValue::iter_to_array"
-                );
-            }
             DataType::Decimal128(precision, scale) => {
                 let decimal_array =
                     ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;
diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs
index 76629e555b8c..39c79b4b9974 100644
--- a/datafusion/common/src/types/native.rs
+++ b/datafusion/common/src/types/native.rs
@@ -407,10 +407,7 @@ impl From<DataType> for NativeType {
             DataType::Union(union_fields, _) => {
                 Union(LogicalUnionFields::from(&union_fields))
             }
-            DataType::Decimal32(p, s)
-            | DataType::Decimal64(p, s)
-            | DataType::Decimal128(p, s)
-            | DataType::Decimal256(p, s) => Decimal(p, s),
+            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
             DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
             DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
             DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 088c4408fff5..db704bee8801 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -523,23 +523,11 @@ mod tests {
         let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;
         let c_dic: ArrayRef = Arc::new(dic_array);
 
-        // Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null]
-        let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![
-            Some("a".repeat(128)),
-            None,
-            Some("b".repeat(128)),
-            None,
-        ]));
-
-        let batch1 = RecordBatch::try_from_iter(vec![
-            ("c_dic", c_dic),
-            ("string_truncation", string_truncation),
-        ])?;
+        let batch1 = 
RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?;
 
         // Use store_parquet to write each batch to its own file
         // . batch1 written into first file and includes:
         //   - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available.
-        //   - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact.
         let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
             LocalFileSystem::new(),
         )));
@@ -575,19 +563,6 @@ mod tests {
             Precision::Exact(Utf8(Some("a".into())))
         );
 
-        // column string_truncation
-        let string_truncation_stats = &stats.column_statistics[1];
-
-        assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
-        assert_eq!(
-            string_truncation_stats.max_value,
-            Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
-        );
-        assert_eq!(
-            string_truncation_stats.min_value,
-            Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
-        );
-
         Ok(())
     }
 
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
index f8bd4dbc1a76..4c0073b94be7 100644
--- a/datafusion/core/tests/fuzz_cases/pruning.rs
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -319,9 +319,14 @@ async fn write_parquet_file(
     row_groups: Vec<Vec<String>>,
 ) -> Bytes {
     let mut buf = BytesMut::new().writer();
-    let props = WriterProperties::builder()
-        .set_statistics_enabled(EnabledStatistics::Chunk) // row group level
-        .set_statistics_truncate_length(truncation_length);
+    let mut props = WriterProperties::builder();
+    if let Some(truncation_length) = truncation_length {
+        props = {
+            #[allow(deprecated)]
+            props.set_max_statistics_size(truncation_length)
+        }
+    }
+    props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
     let props = props.build();
     {
         let mut writer =
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index c44d14abd381..d0ddd9a0b0f1 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -110,11 +110,11 @@ struct ContextWithParquet {
 
 /// The output of running one of the test cases
 struct TestOutput {
-    /// The input query SQL
+    /// The input string
     sql: String,
     /// Execution metrics for the Parquet Scan
     parquet_metrics: MetricsSet,
-    /// number of actual rows in results
+    /// number of rows in results
     result_rows: usize,
     /// the contents of the input, as a string
     pretty_input: String,
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index 44409166d3ce..8613cd481be1 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -34,7 +34,7 @@ struct RowGroupPruningTest {
     expected_files_pruned_by_statistics: Option<usize>,
     expected_row_group_matched_by_bloom_filter: Option<usize>,
     expected_row_group_pruned_by_bloom_filter: Option<usize>,
-    expected_rows: usize,
+    expected_results: usize,
 }
 impl RowGroupPruningTest {
     // Start building the test configuration
@@ -48,7 +48,7 @@ impl RowGroupPruningTest {
             expected_files_pruned_by_statistics: None,
             expected_row_group_matched_by_bloom_filter: None,
             expected_row_group_pruned_by_bloom_filter: None,
-            expected_rows: 0,
+            expected_results: 0,
         }
     }
 
@@ -99,9 +99,9 @@ impl RowGroupPruningTest {
         self
     }
 
-    /// Set the number of expected rows from the output of this test
+    // Set the expected rows for the test
     fn with_expected_rows(mut self, rows: usize) -> Self {
-        self.expected_rows = rows;
+        self.expected_results = 
rows;
         self
     }
 
@@ -145,10 +145,8 @@ impl RowGroupPruningTest {
         );
         assert_eq!(
             output.result_rows,
-            self.expected_rows,
-            "Expected {} rows, got {}: {}",
-            output.result_rows,
-            self.expected_rows,
+            self.expected_results,
+            "mismatched expected rows: {}",
             output.description(),
         );
     }
diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
index 3fce0d4826a2..39d93d2f193e 100644
--- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
+++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
@@ -238,8 +238,6 @@ fn default_field_name(dt: &DataType) -> &str {
         | DataType::LargeListView(_) => {
             unimplemented!("View support not implemented")
         }
-        DataType::Decimal32(_, _) => "decimal",
-        DataType::Decimal64(_, _) => "decimal",
         DataType::Decimal128(_, _) => "decimal",
         DataType::Decimal256(_, _) => "decimal",
     }
diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs
index 1fcc1721017c..1ec2015ce549 100644
--- a/datafusion/datasource-parquet/src/file_format.rs
+++ b/datafusion/datasource-parquet/src/file_format.rs
@@ -19,11 +19,11 @@
 
 use std::any::Any;
 use std::cell::RefCell;
+use std::fmt;
 use std::fmt::Debug;
 use std::ops::Range;
 use std::rc::Rc;
 use std::sync::Arc;
-use std::{fmt, vec};
 
 use arrow::array::RecordBatch;
 use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit};
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 2e364d0d2b80..684eba59536b 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -814,8 +814,6 @@ pub fn can_hash(data_type: &DataType) -> bool {
         DataType::Float16 => true,
         DataType::Float32 => true,
         DataType::Float64 => true,
-        DataType::Decimal32(_, _) => true,
-        DataType::Decimal64(_, _) => true,
         DataType::Decimal128(_, _) => true,
         DataType::Decimal256(_, _) => true,
         DataType::Timestamp(_, _) => true,
diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs
index 54dc2414e4f0..8ab603e04961 100644
--- a/datafusion/physical-plan/src/sorts/cursor.rs
+++ b/datafusion/physical-plan/src/sorts/cursor.rs
@@ -289,6 +289,120 @@ impl CursorArray for StringViewArray {
     }
 }
 
+/// Todo: use the arrow-rs side API once the upstream addition has been merged and released.
+/// Builds a 128-bit composite key for an inline value:
+///
+/// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
+/// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
+///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
+///
+/// This function extracts the length and the 12-byte inline string data from the raw
+/// little-endian `u128` representation, converts them to big-endian ordering, and packs them
+/// into a single `u128` value suitable for fast, branchless comparisons.
+///
+/// # Why include length?
+///
+/// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
+/// compare equal—either because one is a true prefix of the other or because zero-padding
+/// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
+/// handles both content and length in one go.
+///
+/// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
+///
+/// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)     |
+/// |------------|-----------------------|---------------------------------|
+/// | `"bar"`    | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
+/// | `"bar\0"`  | `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
+///
+/// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
+/// then differentiates. Since the inline data fills the high 96 bits and the length the
+/// low 32 bits of the key:
+///
+/// ```text
+/// key("bar")   = 0x62617200000000000000000000000003
+/// key("bar\0") = 0x62617200000000000000000000000004
+/// ⇒ key("bar") < key("bar\0")
+/// ```
+/// # Inlining and Endianness
+///
+/// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory
+///   representation is little‑endian on x86/ARM.
+/// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free.
+/// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`.
+/// - We serialize `length` as big‑endian into `buf[12..16]`.
+/// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte
+///   and `buf[15]` as the least significant, producing a `u128` whose integer value
+///   directly encodes “inline data then length” in big‑endian form.
+///
+/// This ensures that a simple `u128` comparison is equivalent to the desired
+/// lexicographical comparison of the inline bytes followed by length.
+#[inline(always)]
+pub fn inline_key_fast(raw: u128) -> u128 {
+    // 1. Decompose `raw` into little‑endian bytes:
+    //    - raw_bytes[0..4] = length in LE
+    //    - raw_bytes[4..16] = inline string data
+    let raw_bytes = raw.to_le_bytes();
+
+    // 2. Numerically truncate to get the low 32‑bit length (endianness‑free).
+    let length = raw as u32;
+
+    // 3. Build a 16‑byte buffer in big‑endian order:
+    //    - buf[0..12]  = inline string bytes (in original order)
+    //    - buf[12..16] = length.to_be_bytes() (BE)
+    let mut buf = [0u8; 16];
+    buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
+
+    // Why convert length to big-endian for comparison?
+    //
+    // Rust (on most platforms) stores integers in little-endian format,
+    // meaning the least significant byte is at the lowest memory address.
+    // For example, a u32 value like 0x22345677 is stored in memory as:
+    //
+    //   [0x77, 0x56, 0x34, 0x22]   // little-endian layout
+    //     ^     ^     ^     ^
+    //     LSB   ↑↑↑   MSB
+    //
+    // This layout is efficient for arithmetic but *not* suitable for
+    // lexicographic (dictionary-style) comparison of byte arrays.
+    //
+    // To compare values by byte order—e.g., for sorted keys or binary trees—
+    // we must convert them to **big-endian**, where:
+    //
+    //   - The most significant byte (MSB) comes first (index 0)
+    //   - The least significant byte (LSB) comes last (index N-1)
+    //
+    // In big-endian, the same u32 = 0x22345677 would be represented as:
+    //
+    //   [0x22, 0x34, 0x56, 0x77]
+    //
+    // This ordering aligns with natural string/byte sorting, so calling
+    // `.to_be_bytes()` allows us to construct
+    // keys where standard numeric comparison (e.g., `<`, `>`) behaves
+    // like lexicographic byte comparison.
+    buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE
+
+    // 4. Deserialize the buffer as a big‑endian u128:
+    //    buf[0] is MSB, buf[15] is LSB.
+ // Details: + // Note on endianness and layout: + // + // Although `buf[0]` is stored at the lowest memory address, + // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, + // and `buf[15]` as the **least significant byte (LSB)**. + // + // This is the core principle of **big-endian decoding**: + // - Byte at index 0 maps to bits 127..120 (highest) + // - Byte at index 1 maps to bits 119..112 + // - ... + // - Byte at index 15 maps to bits 7..0 (lowest) + // + // So even though memory layout goes from low to high (left to right), + // big-endian treats the **first byte** as highest in value. + // + // This guarantees that comparing two `u128` keys is equivalent to lexicographically + // comparing the original inline bytes, followed by length. + u128::from_be_bytes(buf) +} + impl CursorValues for StringViewArray { fn len(&self) -> usize { self.views().len() @@ -346,8 +460,7 @@ impl CursorValues for StringViewArray { if l.data_buffers().is_empty() && r.data_buffers().is_empty() { let l_view = unsafe { l.views().get_unchecked(l_idx) }; let r_view = unsafe { r.views().get_unchecked(r_idx) }; - return StringViewArray::inline_key_fast(*l_view) - .cmp(&StringViewArray::inline_key_fast(*r_view)); + return inline_key_fast(*l_view).cmp(&inline_key_fast(*r_view)); } unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) } @@ -442,6 +555,7 @@ impl CursorValues for ArrayValues { #[cfg(test)] mod tests { + use arrow::array::GenericBinaryArray; use datafusion_execution::memory_pool::{ GreedyMemoryPool, MemoryConsumer, MemoryPool, }; @@ -606,4 +720,100 @@ mod tests { b.advance(); assert_eq!(a.cmp(&b), Ordering::Less); } + + /// Integration tests for `inline_key_fast` covering: + /// + /// 1. Monotonic ordering across increasing lengths and lexical variations. + /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. + /// + /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why + /// the length field is required even when all inline bytes fit in 12 bytes. + /// + /// The test includes strings that verify correct byte order (prevent reversal bugs), + /// and length-based tie-breaking in the composite key. + /// + /// The test confirms that `inline_key_fast` produces keys which sort consistently + /// with the expected lexicographical order of the raw byte arrays. + #[test] + fn test_inline_key_fast_various_lengths_and_lexical() { + /// Helper to create a raw u128 value representing an inline ByteView: + /// - `length`: number of meaningful bytes (must be ≤ 12) + /// - `data`: the actual inline data bytes + /// + /// The first 4 bytes encode length in little-endian, + /// the following 12 bytes contain the inline string data (unpadded). 
fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
+            assert!(length as usize <= 12, "Inline length must be ≤ 12");
+            assert!(
+                data.len() == length as usize,
+                "Data length must match `length`"
+            );
+
+            let mut raw_bytes = [0u8; 16];
+            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
+            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
+            u128::from_le_bytes(raw_bytes)
+        }
+
+        // Test inputs: various lengths and lexical orders,
+        // plus special cases for byte order and length tie-breaking
+        let test_inputs: Vec<&[u8]> = vec![
+            b"a",
+            b"aa",
+            b"aaa",
+            b"aab",
+            b"abcd",
+            b"abcde",
+            b"abcdef",
+            b"abcdefg",
+            b"abcdefgh",
+            b"abcdefghi",
+            b"abcdefghij",
+            b"abcdefghijk",
+            b"abcdefghijkl",
+            // Tests for byte-order reversal bug:
+            // Without the fix, "backend one" would compare as "eno dnekcab",
+            // causing incorrect sort order relative to "backend two".
+            b"backend one",
+            b"backend two",
+            // Tests length-tiebreaker logic:
+            // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
+            // so only the length differentiates their ordering.
+            b"bar",
+            b"bar\0",
+            // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
+            b"than12Byt",
+            b"than12Bytes",
+            b"than12Bytes\0",
+            b"than12Bytesx",
+            b"than12Bytex",
+            b"than12Bytez",
+            // Additional lexical tests
+            b"xyy",
+            b"xyz",
+            b"xza",
+        ];
+
+        // Create a GenericBinaryArray for cross-comparison of lex order
+        let array: GenericBinaryArray<i32> = GenericBinaryArray::from(
+            test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>(),
+        );
+
+        for i in 0..array.len() - 1 {
+            let v1 = array.value(i);
+            let v2 = array.value(i + 1);
+
+            // Assert the array's natural lexical ordering is correct
+            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
+
+            // Assert the keys produced by inline_key_fast reflect the same ordering
+            let key1 = inline_key_fast(make_raw_inline(v1.len() as u32, v1));
+            let key2 = inline_key_fast(make_raw_inline(v2.len() as u32, v2));
+
+            assert!(
+                key1 < key2,
+                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
+            );
+        }
+    }
+}
diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto
index f5c79cf3d9a4..8cb272605899 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -136,19 +136,7 @@ enum IntervalUnit{
   MonthDayNano = 2;
 }
 
-message Decimal32Type {
-  reserved 1, 2;
-  uint32 precision = 3;
-  int32 scale = 4;
-}
-
-message Decimal64Type {
-  reserved 1, 2;
-  uint32 precision = 3;
-  int32 scale = 4;
-}
-
-message Decimal128Type {
+message Decimal{
   reserved 1, 2;
   uint32 precision = 3;
   int32 scale = 4;
@@ -298,8 +286,6 @@ message ScalarValue{
     ScalarNestedValue struct_value = 32;
     ScalarNestedValue map_value = 41;
 
-    Decimal32 decimal32_value = 43;
-    Decimal64 decimal64_value = 44;
     Decimal128 decimal128_value = 20;
     Decimal256 decimal256_value = 39;
 
@@ -324,18 +310,6 @@ message ScalarValue{
   }
 }
 
-message Decimal32{
-  bytes value = 1;
-  int64 p = 2;
-  int64 s = 3;
-}
-
-message Decimal64{
-  bytes value = 1;
-  int64 p = 2;
-  int64 s = 3;
-}
-
 message Decimal128{
   bytes value = 1;
   int64 p = 2;
@@ -378,9 +352,7 @@ message ArrowType{
     TimeUnit TIME32 = 21 ;
     TimeUnit TIME64 = 22 ;
     IntervalUnit INTERVAL = 23 ;
-    Decimal32Type DECIMAL32 = 40;
-    Decimal64Type DECIMAL64 = 41;
-    Decimal128Type DECIMAL128 = 24;
+    Decimal DECIMAL = 24 ;
     Decimal256Type DECIMAL256 = 36;
     List LIST 
= 25; List LARGE_LIST = 26; @@ -508,7 +480,9 @@ message ParquetColumnOptions { uint64 bloom_filter_ndv = 7; } - reserved 8; // used to be uint32 max_statistics_size = 8; + oneof max_statistics_size_opt { + uint32 max_statistics_size = 8; + } } message ParquetOptions { @@ -547,7 +521,9 @@ message ParquetOptions { string statistics_enabled = 13; } - reserved 14; // used to be uint32 max_statistics_size = 20; + oneof max_statistics_size_opt { + uint64 max_statistics_size = 14; + } oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index c5242d0176e6..0823e150268d 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -37,7 +37,6 @@ use datafusion_common::{ TableParquetOptions, }, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, - not_impl_err, parsers::CompressionTypeVariant, plan_datafusion_err, stats::Precision, @@ -258,15 +257,7 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType { arrow_type::ArrowTypeEnum::Interval(interval_unit) => { DataType::Interval(parse_i32_to_interval_unit(interval_unit)?) } - arrow_type::ArrowTypeEnum::Decimal32(protobuf::Decimal32Type { - precision, - scale, - }) => DataType::Decimal32(*precision as u8, *scale as i8), - arrow_type::ArrowTypeEnum::Decimal64(protobuf::Decimal64Type { - precision, - scale, - }) => DataType::Decimal64(*precision as u8, *scale as i8), - arrow_type::ArrowTypeEnum::Decimal128(protobuf::Decimal128Type { + arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { precision, scale, }) => DataType::Decimal128(*precision as u8, *scale as i8), @@ -478,14 +469,6 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { let null_type: DataType = v.try_into()?; null_type.try_into().map_err(Error::DataFusionError)? 
            }
-            Value::Decimal32Value(_val) => {
-                return not_impl_err!("Decimal32 protobuf deserialization")
-                    .map_err(Error::DataFusionError)
-            }
-            Value::Decimal64Value(_val) => {
-                return not_impl_err!("Decimal64 protobuf deserialization")
-                    .map_err(Error::DataFusionError)
-            }
             Value::Decimal128Value(val) => {
                 let array = vec_to_array(val.value.clone());
                 Self::Decimal128(
@@ -955,6 +938,12 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
                     protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
                 })
                 .unwrap_or(None),
+            max_statistics_size: value
+                .max_statistics_size_opt.as_ref()
+                .map(|opt| match opt {
+                    protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize),
+                })
+                .unwrap_or(None),
             max_row_group_size: value.max_row_group_size as usize,
             created_by: value.created_by.clone(),
             column_index_truncate_length: value
@@ -1020,6 +1009,12 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
                     protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
                 })
                 .unwrap_or(None),
+            max_statistics_size: value
+                .max_statistics_size_opt
+                .map(|opt| match opt {
+                    protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize),
+                })
+                .unwrap_or(None),
             encoding: value
                 .encoding_opt.clone()
                 .map(|opt| match opt {
diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs
index 48782ff1d93a..f35fd1594695 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -243,14 +243,8 @@ impl serde::Serialize for ArrowType {
                     .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?;
                 struct_ser.serialize_field("INTERVAL", &v)?;
             }
-            arrow_type::ArrowTypeEnum::Decimal32(v) => {
-                struct_ser.serialize_field("DECIMAL32", v)?;
-            }
-            arrow_type::ArrowTypeEnum::Decimal64(v) => {
-                struct_ser.serialize_field("DECIMAL64", v)?;
-            }
-            arrow_type::ArrowTypeEnum::Decimal128(v) => {
-                struct_ser.serialize_field("DECIMAL128", v)?;
+            arrow_type::ArrowTypeEnum::Decimal(v) => {
+                struct_ser.serialize_field("DECIMAL", v)?;
             }
             arrow_type::ArrowTypeEnum::Decimal256(v) => {
                 struct_ser.serialize_field("DECIMAL256", v)?;
@@ -320,9 +314,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             "TIME32",
             "TIME64",
             "INTERVAL",
-            "DECIMAL32",
-            "DECIMAL64",
-            "DECIMAL128",
+            "DECIMAL",
             "DECIMAL256",
             "LIST",
             "LARGE_LIST",
@@ -364,9 +356,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             Time32,
             Time64,
             Interval,
-            Decimal32,
-            Decimal64,
-            Decimal128,
+            Decimal,
             Decimal256,
             List,
             LargeList,
@@ -423,9 +413,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                             "TIME32" => Ok(GeneratedField::Time32),
                             "TIME64" => Ok(GeneratedField::Time64),
                             "INTERVAL" => Ok(GeneratedField::Interval),
-                            "DECIMAL32" => Ok(GeneratedField::Decimal32),
-                            "DECIMAL64" => Ok(GeneratedField::Decimal64),
-                            "DECIMAL128" => Ok(GeneratedField::Decimal128),
+                            "DECIMAL" => Ok(GeneratedField::Decimal),
                             "DECIMAL256" => Ok(GeneratedField::Decimal256),
                             "LIST" => Ok(GeneratedField::List),
                             "LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList),
@@ -640,25 +628,11 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                             }
                             arrow_type_enum__ = map_.next_value::<::std::option::Option<IntervalUnit>>()?.map(|x| arrow_type::ArrowTypeEnum::Interval(x as i32));
                         }
-                        GeneratedField::Decimal32 => {
+                        GeneratedField::Decimal => {
                             if arrow_type_enum__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("DECIMAL32"));
+                                return 
Err(serde::de::Error::duplicate_field("DECIMAL")); } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal32) -; - } - GeneratedField::Decimal64 => { - if arrow_type_enum__.is_some() { - return Err(serde::de::Error::duplicate_field("DECIMAL64")); - } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal64) -; - } - GeneratedField::Decimal128 => { - if arrow_type_enum__.is_some() { - return Err(serde::de::Error::duplicate_field("DECIMAL128")); - } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal128) + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal) ; } GeneratedField::Decimal256 => { @@ -2225,454 +2199,68 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions { date_format__ = Some(map_.next_value()?); } GeneratedField::DatetimeFormat => { - if datetime_format__.is_some() { - return Err(serde::de::Error::duplicate_field("datetimeFormat")); - } - datetime_format__ = Some(map_.next_value()?); - } - GeneratedField::TimestampFormat => { - if timestamp_format__.is_some() { - return Err(serde::de::Error::duplicate_field("timestampFormat")); - } - timestamp_format__ = Some(map_.next_value()?); - } - GeneratedField::TimeFormat => { - if time_format__.is_some() { - return Err(serde::de::Error::duplicate_field("timeFormat")); - } - time_format__ = Some(map_.next_value()?); - } - GeneratedField::NullValue => { - if null_value__.is_some() { - return Err(serde::de::Error::duplicate_field("nullValue")); - } - null_value__ = Some(map_.next_value()?); - } - GeneratedField::Quote => { - if quote__.is_some() { - return Err(serde::de::Error::duplicate_field("quote")); - } - quote__ = Some(map_.next_value()?); - } - GeneratedField::Escape => { - if escape__.is_some() { - return Err(serde::de::Error::duplicate_field("escape")); - } - escape__ = Some(map_.next_value()?); - } - GeneratedField::DoubleQuote => { - if double_quote__.is_some() { - return Err(serde::de::Error::duplicate_field("doubleQuote")); - } - double_quote__ = Some(map_.next_value()?); - } - } - } - Ok(CsvWriterOptions { - compression: compression__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), - has_header: has_header__.unwrap_or_default(), - date_format: date_format__.unwrap_or_default(), - datetime_format: datetime_format__.unwrap_or_default(), - timestamp_format: timestamp_format__.unwrap_or_default(), - time_format: time_format__.unwrap_or_default(), - null_value: null_value__.unwrap_or_default(), - quote: quote__.unwrap_or_default(), - escape: escape__.unwrap_or_default(), - double_quote: double_quote__.unwrap_or_default(), - }) - } - } - deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for Decimal128 { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if !self.value.is_empty() { - len += 1; - } - if self.p != 0 { - len += 1; - } - if self.s != 0 { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; - if !self.value.is_empty() { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; - } - if self.p != 0 { - 
#[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; - } - if self.s != 0 { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for Decimal128 { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "value", - "p", - "s", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Value, - P, - S, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "value" => Ok(GeneratedField::Value), - "p" => Ok(GeneratedField::P), - "s" => Ok(GeneratedField::S), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal128; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal128") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut value__ = None; - let mut p__ = None; - let mut s__ = None; - while let Some(k) = map_.next_key()? 
{ - match k { - GeneratedField::Value => { - if value__.is_some() { - return Err(serde::de::Error::duplicate_field("value")); - } - value__ = - Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) - ; - } - GeneratedField::P => { - if p__.is_some() { - return Err(serde::de::Error::duplicate_field("p")); - } - p__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::S => { - if s__.is_some() { - return Err(serde::de::Error::duplicate_field("s")); - } - s__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - } - } - Ok(Decimal128 { - value: value__.unwrap_or_default(), - p: p__.unwrap_or_default(), - s: s__.unwrap_or_default(), - }) - } - } - deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for Decimal128Type { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.precision != 0 { - len += 1; - } - if self.scale != 0 { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128Type", len)?; - if self.precision != 0 { - struct_ser.serialize_field("precision", &self.precision)?; - } - if self.scale != 0 { - struct_ser.serialize_field("scale", &self.scale)?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for Decimal128Type { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "precision", - "scale", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Precision, - Scale, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "precision" => Ok(GeneratedField::Precision), - "scale" => Ok(GeneratedField::Scale), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal128Type; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal128Type") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut precision__ = None; - let mut scale__ = None; - while let Some(k) = map_.next_key()? 
{ - match k { - GeneratedField::Precision => { - if precision__.is_some() { - return Err(serde::de::Error::duplicate_field("precision")); - } - precision__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::Scale => { - if scale__.is_some() { - return Err(serde::de::Error::duplicate_field("scale")); - } - scale__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - } - } - Ok(Decimal128Type { - precision: precision__.unwrap_or_default(), - scale: scale__.unwrap_or_default(), - }) - } - } - deserializer.deserialize_struct("datafusion_common.Decimal128Type", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for Decimal256 { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if !self.value.is_empty() { - len += 1; - } - if self.p != 0 { - len += 1; - } - if self.s != 0 { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; - if !self.value.is_empty() { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; - } - if self.p != 0 { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; - } - if self.s != 0 { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for Decimal256 { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "value", - "p", - "s", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Value, - P, - S, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "value" => Ok(GeneratedField::Value), - "p" => Ok(GeneratedField::P), - "s" => Ok(GeneratedField::S), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + if datetime_format__.is_some() { + return Err(serde::de::Error::duplicate_field("datetimeFormat")); + } + datetime_format__ = Some(map_.next_value()?); } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut value__ = None; - let mut p__ = None; - let mut s__ = None; - while let Some(k) = map_.next_key()? 
{ - match k { - GeneratedField::Value => { - if value__.is_some() { - return Err(serde::de::Error::duplicate_field("value")); + GeneratedField::TimestampFormat => { + if timestamp_format__.is_some() { + return Err(serde::de::Error::duplicate_field("timestampFormat")); } - value__ = - Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) - ; + timestamp_format__ = Some(map_.next_value()?); } - GeneratedField::P => { - if p__.is_some() { - return Err(serde::de::Error::duplicate_field("p")); + GeneratedField::TimeFormat => { + if time_format__.is_some() { + return Err(serde::de::Error::duplicate_field("timeFormat")); } - p__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; + time_format__ = Some(map_.next_value()?); } - GeneratedField::S => { - if s__.is_some() { - return Err(serde::de::Error::duplicate_field("s")); + GeneratedField::NullValue => { + if null_value__.is_some() { + return Err(serde::de::Error::duplicate_field("nullValue")); } - s__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; + null_value__ = Some(map_.next_value()?); + } + GeneratedField::Quote => { + if quote__.is_some() { + return Err(serde::de::Error::duplicate_field("quote")); + } + quote__ = Some(map_.next_value()?); + } + GeneratedField::Escape => { + if escape__.is_some() { + return Err(serde::de::Error::duplicate_field("escape")); + } + escape__ = Some(map_.next_value()?); + } + GeneratedField::DoubleQuote => { + if double_quote__.is_some() { + return Err(serde::de::Error::duplicate_field("doubleQuote")); + } + double_quote__ = Some(map_.next_value()?); } } } - Ok(Decimal256 { - value: value__.unwrap_or_default(), - p: p__.unwrap_or_default(), - s: s__.unwrap_or_default(), + Ok(CsvWriterOptions { + compression: compression__.unwrap_or_default(), + delimiter: delimiter__.unwrap_or_default(), + has_header: has_header__.unwrap_or_default(), + date_format: date_format__.unwrap_or_default(), + datetime_format: datetime_format__.unwrap_or_default(), + timestamp_format: timestamp_format__.unwrap_or_default(), + time_format: time_format__.unwrap_or_default(), + null_value: null_value__.unwrap_or_default(), + quote: quote__.unwrap_or_default(), + escape: escape__.unwrap_or_default(), + double_quote: double_quote__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256Type { +impl serde::Serialize for Decimal { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2686,7 +2274,7 @@ impl serde::Serialize for Decimal256Type { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2696,7 +2284,7 @@ impl serde::Serialize for Decimal256Type { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256Type { +impl<'de> serde::Deserialize<'de> for Decimal { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2743,13 +2331,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256Type; + type Value = Decimal; fn 
expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256Type") + formatter.write_str("struct datafusion_common.Decimal") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2775,16 +2363,16 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } } } - Ok(Decimal256Type { + Ok(Decimal { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal32 { +impl serde::Serialize for Decimal128 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2801,7 +2389,7 @@ impl serde::Serialize for Decimal32 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2820,7 +2408,7 @@ impl serde::Serialize for Decimal32 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal32 { +impl<'de> serde::Deserialize<'de> for Decimal128 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2870,13 +2458,13 @@ impl<'de> serde::Deserialize<'de> for Decimal32 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal32; + type Value = Decimal128; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal32") + formatter.write_str("struct datafusion_common.Decimal128") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2911,129 +2499,17 @@ impl<'de> serde::Deserialize<'de> for Decimal32 { } } } - Ok(Decimal32 { + Ok(Decimal128 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal32", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for Decimal32Type { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.precision != 0 { - len += 1; - } - if self.scale != 0 { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32Type", len)?; - if self.precision != 0 { - struct_ser.serialize_field("precision", &self.precision)?; - } - if self.scale != 0 { - struct_ser.serialize_field("scale", &self.scale)?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for Decimal32Type { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "precision", - "scale", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Precision, - Scale, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - 
impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "precision" => Ok(GeneratedField::Precision), - "scale" => Ok(GeneratedField::Scale), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal32Type; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal32Type") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut precision__ = None; - let mut scale__ = None; - while let Some(k) = map_.next_key()? { - match k { - GeneratedField::Precision => { - if precision__.is_some() { - return Err(serde::de::Error::duplicate_field("precision")); - } - precision__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::Scale => { - if scale__.is_some() { - return Err(serde::de::Error::duplicate_field("scale")); - } - scale__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - } - } - Ok(Decimal32Type { - precision: precision__.unwrap_or_default(), - scale: scale__.unwrap_or_default(), - }) - } - } - deserializer.deserialize_struct("datafusion_common.Decimal32Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal64 { +impl serde::Serialize for Decimal256 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -3050,7 +2526,7 @@ impl serde::Serialize for Decimal64 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -3069,7 +2545,7 @@ impl serde::Serialize for Decimal64 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal64 { +impl<'de> serde::Deserialize<'de> for Decimal256 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -3119,13 +2595,13 @@ impl<'de> serde::Deserialize<'de> for Decimal64 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal64; + type Value = Decimal256; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal64") + formatter.write_str("struct datafusion_common.Decimal256") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -3160,17 +2636,17 @@ impl<'de> serde::Deserialize<'de> for Decimal64 { } } } - Ok(Decimal64 { + Ok(Decimal256 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal64", FIELDS, GeneratedVisitor) + 
deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal64Type { +impl serde::Serialize for Decimal256Type { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -3184,7 +2660,7 @@ impl serde::Serialize for Decimal64Type { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64Type", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -3194,7 +2670,7 @@ impl serde::Serialize for Decimal64Type { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal64Type { +impl<'de> serde::Deserialize<'de> for Decimal256Type { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -3241,13 +2717,13 @@ impl<'de> serde::Deserialize<'de> for Decimal64Type { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal64Type; + type Value = Decimal256Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal64Type") + formatter.write_str("struct datafusion_common.Decimal256Type") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -3273,13 +2749,13 @@ impl<'de> serde::Deserialize<'de> for Decimal64Type { } } } - Ok(Decimal64Type { + Ok(Decimal256Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal64Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) } } impl serde::Serialize for DfField { @@ -5113,6 +4589,9 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } + if self.max_statistics_size_opt.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -5165,6 +4644,13 @@ impl serde::Serialize for ParquetColumnOptions { } } } + if let Some(v) = self.max_statistics_size_opt.as_ref() { + match v { + parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { + struct_ser.serialize_field("maxStatisticsSize", v)?; + } + } + } struct_ser.end() } } @@ -5187,6 +4673,8 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", + "max_statistics_size", + "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -5198,6 +4686,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, + MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -5226,6 +4715,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), + "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => 
Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5252,6 +4742,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; + let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::BloomFilterEnabled => { @@ -5296,6 +4787,12 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } + GeneratedField::MaxStatisticsSize => { + if max_statistics_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); + } + max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); + } } } Ok(ParquetColumnOptions { @@ -5306,6 +4803,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, + max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -5592,6 +5090,9 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } + if self.max_statistics_size_opt.is_some() { + len += 1; + } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5715,6 +5216,15 @@ impl serde::Serialize for ParquetOptions { } } } + if let Some(v) = self.max_statistics_size_opt.as_ref() { + match v { + parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; + } + } + } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5819,6 +5329,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", + "max_statistics_size", + "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "statistics_truncate_length", @@ -5858,6 +5370,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, + MaxStatisticsSize, ColumnIndexTruncateLength, StatisticsTruncateLength, Encoding, @@ -5909,6 +5422,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), + "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength), "encoding" => Ok(GeneratedField::Encoding), @@ -5958,6 +5472,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; + let mut max_statistics_size_opt__ = None; let mut 
column_index_truncate_length_opt__ = None; let mut statistics_truncate_length_opt__ = None; let mut encoding_opt__ = None; @@ -6124,6 +5639,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } + GeneratedField::MaxStatisticsSize => { + if max_statistics_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); + } + max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); + } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -6187,6 +5708,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, + max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, statistics_truncate_length_opt: statistics_truncate_length_opt__, encoding_opt: encoding_opt__, @@ -7437,12 +6959,6 @@ impl serde::Serialize for ScalarValue { scalar_value::Value::MapValue(v) => { struct_ser.serialize_field("mapValue", v)?; } - scalar_value::Value::Decimal32Value(v) => { - struct_ser.serialize_field("decimal32Value", v)?; - } - scalar_value::Value::Decimal64Value(v) => { - struct_ser.serialize_field("decimal64Value", v)?; - } scalar_value::Value::Decimal128Value(v) => { struct_ser.serialize_field("decimal128Value", v)?; } @@ -7569,10 +7085,6 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "structValue", "map_value", "mapValue", - "decimal32_value", - "decimal32Value", - "decimal64_value", - "decimal64Value", "decimal128_value", "decimal128Value", "decimal256_value", @@ -7635,8 +7147,6 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { FixedSizeListValue, StructValue, MapValue, - Decimal32Value, - Decimal64Value, Decimal128Value, Decimal256Value, Date64Value, @@ -7698,8 +7208,6 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue), "structValue" | "struct_value" => Ok(GeneratedField::StructValue), "mapValue" | "map_value" => Ok(GeneratedField::MapValue), - "decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value), - "decimal64Value" | "decimal64_value" => Ok(GeneratedField::Decimal64Value), "decimal128Value" | "decimal128_value" => Ok(GeneratedField::Decimal128Value), "decimal256Value" | "decimal256_value" => Ok(GeneratedField::Decimal256Value), "date64Value" | "date_64_value" => Ok(GeneratedField::Date64Value), @@ -7877,20 +7385,6 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { return Err(serde::de::Error::duplicate_field("mapValue")); } value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::MapValue) -; - } - GeneratedField::Decimal32Value => { - if value__.is_some() { - return Err(serde::de::Error::duplicate_field("decimal32Value")); - } - value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal32Value) -; - } - GeneratedField::Decimal64Value => { - if value__.is_some() { - return Err(serde::de::Error::duplicate_field("decimal64Value")); - } - value__ = 
map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal64Value) ; } GeneratedField::Decimal128Value => { diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index aa23cea57470..ac4a9ea4be69 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -117,21 +117,7 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal32Type { - #[prost(uint32, tag = "3")] - pub precision: u32, - #[prost(int32, tag = "4")] - pub scale: i32, -} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal64Type { - #[prost(uint32, tag = "3")] - pub precision: u32, - #[prost(int32, tag = "4")] - pub scale: i32, -} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal128Type { +pub struct Decimal { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -311,7 +297,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -366,10 +352,6 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), - #[prost(message, tag = "43")] - Decimal32Value(super::Decimal32), - #[prost(message, tag = "44")] - Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -409,24 +391,6 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] -pub struct Decimal32 { - #[prost(bytes = "vec", tag = "1")] - pub value: ::prost::alloc::vec::Vec, - #[prost(int64, tag = "2")] - pub p: i64, - #[prost(int64, tag = "3")] - pub s: i64, -} -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Decimal64 { - #[prost(bytes = "vec", tag = "1")] - pub value: ::prost::alloc::vec::Vec, - #[prost(int64, tag = "2")] - pub p: i64, - #[prost(int64, tag = "3")] - pub s: i64, -} -#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -449,7 +413,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -516,12 +480,8 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), - #[prost(message, tag = "40")] - Decimal32(super::Decimal32Type), - #[prost(message, tag = "41")] - Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal128(super::Decimal128Type), + Decimal(super::Decimal), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -702,6 +662,10 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: 
::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, + #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] + pub max_statistics_size_opt: ::core::option::Option< + parquet_column_options::MaxStatisticsSizeOpt, + >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -740,6 +704,11 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint32, tag = "8")] + MaxStatisticsSize(u32), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -817,6 +786,10 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, + #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] + pub max_statistics_size_opt: ::core::option::Option< + parquet_options::MaxStatisticsSizeOpt, + >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -857,6 +830,11 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint64, tag = "14")] + MaxStatisticsSize(u64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index c06427065733..b6cbe5759cfc 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -189,15 +189,7 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { value: Some(Box::new(value_type.as_ref().try_into()?)), })) } - DataType::Decimal32(precision, scale) => Self::Decimal32(protobuf::Decimal32Type { - precision: *precision as u32, - scale: *scale as i32, - }), - DataType::Decimal64(precision, scale) => Self::Decimal64(protobuf::Decimal64Type { - precision: *precision as u32, - scale: *scale as i32, - }), - DataType::Decimal128(precision, scale) => Self::Decimal128(protobuf::Decimal128Type { + DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal { precision: *precision as u32, scale: *scale as i32, }), @@ -825,6 +817,8 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), + #[allow(deprecated)] + max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -864,6 +858,12 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), + #[allow(deprecated)] + 
max_statistics_size_opt: value.max_statistics_size.map(|v| { + protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( + v as u32, + ) + }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index aa23cea57470..ac4a9ea4be69 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -117,21 +117,7 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal32Type { - #[prost(uint32, tag = "3")] - pub precision: u32, - #[prost(int32, tag = "4")] - pub scale: i32, -} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal64Type { - #[prost(uint32, tag = "3")] - pub precision: u32, - #[prost(int32, tag = "4")] - pub scale: i32, -} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal128Type { +pub struct Decimal { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -311,7 +297,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -366,10 +352,6 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), - #[prost(message, tag = "43")] - Decimal32Value(super::Decimal32), - #[prost(message, tag = "44")] - Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -409,24 +391,6 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] -pub struct Decimal32 { - #[prost(bytes = "vec", tag = "1")] - pub value: ::prost::alloc::vec::Vec, - #[prost(int64, tag = "2")] - pub p: i64, - #[prost(int64, tag = "3")] - pub s: i64, -} -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Decimal64 { - #[prost(bytes = "vec", tag = "1")] - pub value: ::prost::alloc::vec::Vec, - #[prost(int64, tag = "2")] - pub p: i64, - #[prost(int64, tag = "3")] - pub s: i64, -} -#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -449,7 +413,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -516,12 +480,8 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), - #[prost(message, tag = "40")] - Decimal32(super::Decimal32Type), - #[prost(message, tag = "41")] - Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal128(super::Decimal128Type), + Decimal(super::Decimal), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), 
#[prost(message, tag = "25")] @@ -702,6 +662,10 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, + #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] + pub max_statistics_size_opt: ::core::option::Option< + parquet_column_options::MaxStatisticsSizeOpt, + >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -740,6 +704,11 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint32, tag = "8")] + MaxStatisticsSize(u32), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -817,6 +786,10 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, + #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] + pub max_statistics_size_opt: ::core::option::Option< + parquet_options::MaxStatisticsSizeOpt, + >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -857,6 +830,11 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint64, tag = "14")] + MaxStatisticsSize(u64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 654607bd733d..620442c79e72 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -382,6 +382,9 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), + max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { + parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) + }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -437,6 +440,9 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), + max_statistics_size_opt: options.max_statistics_size.map(|size| { + parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) + }), }) } }).collect(), @@ -475,6 +481,9 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), + max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { + parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, + }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -533,6 +542,11 @@ impl From for 
ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), + max_statistics_size: proto.max_statistics_size_opt.map( + |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { + v as usize + }, + ), } } } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index d4e911a62c09..c4baf9b438e9 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1726,12 +1726,6 @@ impl Unparser<'_> { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), - DataType::Decimal32(_precision, _scale) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } - DataType::Decimal64(_precision, _scale) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 96ab84ab9095..9ec3ff8409c2 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -8062,19 +8062,7 @@ select arrow_typeof(a) from fixed_size_col_table; FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) -query ? rowsort -SELECT DISTINCT a FROM fixed_size_col_table ----- -[1, 2, 3] -[4, 5, 6] - -query ?I rowsort -SELECT a, count(*) FROM fixed_size_col_table GROUP BY a ----- -[1, 2, 3] 1 -[4, 5, 6] 1 - -statement error Cast error: Cannot cast to FixedSizeList\(3\): value at index 0 has length 2 +statement error create table varying_fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5]); # https://github.com/apache/datafusion/issues/16187 diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 096cde86f26f..5eeb05e814ac 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -332,6 +332,7 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, +'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value' diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index fb2c89020112..8eb1fa9a0086 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -242,6 +242,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 +datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -252,7 +253,7 @@ datafusion.execution.parquet.schema_force_view_types true datafusion.execution.parquet.skip_arrow_metadata false datafusion.execution.parquet.skip_metadata true 
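The restored option also surfaces through the settings shown above (`format.max_statistics_size` in COPY options, `datafusion.execution.parquet.max_statistics_size` in the information schema). A short sketch of setting it programmatically, assuming the `TableParquetOptions` layout this patch references via `global_options.global.max_statistics_size`; the field is deprecated upstream, hence the `#[allow(deprecated)]` seen in `to_proto/mod.rs`:

    use datafusion_common::config::TableParquetOptions;

    fn main() {
        let mut opts = TableParquetOptions::default();
        // Assumed field path, taken from the to_proto conversion in this
        // patch; the value round-trips through the new
        // `max_statistics_size_opt` oneof when (de)serialized.
        #[allow(deprecated)]
        {
            opts.global.max_statistics_size = Some(4096);
        }
        #[allow(deprecated)]
        let configured = opts.global.max_statistics_size;
        assert_eq!(configured, Some(4096));
    }
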
datafusion.execution.parquet.statistics_enabled page -datafusion.execution.parquet.statistics_truncate_length 64 +datafusion.execution.parquet.statistics_truncate_length NULL datafusion.execution.parquet.write_batch_size 1024 datafusion.execution.parquet.writer_version 1.0 datafusion.execution.planning_concurrency 13 @@ -356,6 +357,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. +datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer @@ -366,7 +368,7 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting -datafusion.execution.parquet.statistics_truncate_length 64 (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting +datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0" datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt index 37daf551c2c3..a43d4be62551 100644 --- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt +++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt @@ -35,7 +35,7 @@ query TT explain format indent select * from t; ---- logical_plan TableScan: t projection=[int_col, str_col] -physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(212), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] +physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index f9e89902998c..2b70f4f9c00a 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -59,10 +59,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok @@ -85,10 +85,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index c536c8165c5a..0b851f917855 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..266], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:266..526, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..272], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:272..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..280], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:280..554, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..286], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:286..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..263], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..268], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:268..537], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:263..526]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..277], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:281..563], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:277..554]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs index f7eedcb7a2b2..67215e8e343e 100644 --- a/datafusion/substrait/src/logical_plan/consumer/utils.rs +++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs @@ -213,8 +213,6 @@ pub fn rename_data_type( | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View - | DataType::Decimal32(_, _) - | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => Ok(data_type.clone()), } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 828beec8545d..a9c39b086118 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -212,12 +212,6 @@ use datafusion_physical_expr_adapter::{ }; ``` -### Upgrade to arrow `56.0.0` and parquet `56.0.0` - -This version of DataFusion upgrades the underlying Apache Arrow implementation -to version `56.0.0`. See the [release notes](https://github.com/apache/arrow-rs/releases/tag/56.0.0) -for more details. - ### Added `ExecutionPlan::reset_state` In order to fix a bug in DataFusion `49.0.0` where dynamic filters (currently only generated in the presence of a query such as `ORDER BY ... LIMIT ...`) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 3d4730958fb3..cdc6d615e37f 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -96,10 +96,11 @@ The following configuration settings are available: | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 50.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statistics truncate length. 
If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | diff --git a/parquet-testing b/parquet-testing index 107b36603e05..a3d96a65e11e 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 107b36603e051aee26bd93e04b871034f6c756c0 +Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 diff --git a/testing b/testing index d2a137123034..0d60ccae40d0 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d2a13712303498963395318a4eb42872e66aead7 +Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 diff --git a/typos.toml b/typos.toml index 46f21febcf86..5741ab8e9cc9 100644 --- a/typos.toml +++ b/typos.toml @@ -15,6 +15,7 @@ aroun = "aroun" abov = "abov" Ois = "Ois" alo = "alo" +Byt = "Byt" # abbreviations, common words, etc. typ = "typ" From ceea46169f2853c8d10237f1402ea44410f314f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=ADa=20Adriana?= Date: Wed, 1 Oct 2025 15:15:17 +0200 Subject: [PATCH 05/13] fix(SubqueryAlias): use maybe_project_redundant_column (#17478) (#51) * fix(SubqueryAlias): use maybe_project_redundant_column Fixes #17405 * chore: format * ci: retry * chore(SubqueryAlias): restructore duplicate detection and add tests * docs: add examples and context to the reproducer (cherry picked from commit c910db4b9726a4b6f44713b0c92d79d82d9a18ef) Co-authored-by: Filippo Rossi <12383260+notfilippo@users.noreply.github.com> --- datafusion/core/src/physical_planner.rs | 181 ++++++------------ datafusion/expr/src/logical_plan/builder.rs | 122 +++++++----- datafusion/expr/src/logical_plan/plan.rs | 48 ++++- .../tests/cases/consumer_integration.rs | 42 ++-- 4 files changed, 196 insertions(+), 197 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 6618d9495d78..a06106a96465 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -61,9 +61,7 @@ use arrow::array::{builder::StringBuilder, RecordBatch}; use arrow::compute::SortOptions; use arrow::datatypes::{Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; -use datafusion_common::tree_node::{ - Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeVisitor, -}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, @@ -83,7 +81,7 @@ use datafusion_expr::{ WindowFrameBound, WriteOp, }; use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; -use datafusion_physical_expr::expressions::{Column, Literal}; +use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::{ create_physical_sort_exprs, LexOrdering, PhysicalSortExpr, }; @@ -2177,11 +2175,7 @@ impl DefaultPhysicalPlanner { let physical_expr = self.create_physical_expr(e, input_logical_schema, session_state); - 
// Check for possible column name mismatches - let final_physical_expr = - maybe_fix_physical_column_name(physical_expr, &input_physical_schema); - - tuple_err((final_physical_expr, physical_name)) + tuple_err((physical_expr, physical_name)) }) .collect::>>()?; @@ -2287,47 +2281,6 @@ fn tuple_err(value: (Result, Result)) -> Result<(T, R)> { } } -// Handle the case where the name of a physical column expression does not match the corresponding physical input fields names. -// Physical column names are derived from the physical schema, whereas physical column expressions are derived from the logical column names. -// -// This is a special case that applies only to column expressions. Logical plans may slightly modify column names by appending a suffix (e.g., using ':'), -// to avoid duplicates—since DFSchemas do not allow duplicate names. For example: `count(Int64(1)):1`. -fn maybe_fix_physical_column_name( - expr: Result>, - input_physical_schema: &SchemaRef, -) -> Result> { - let Ok(expr) = expr else { return expr }; - expr.transform_down(|node| { - if let Some(column) = node.as_any().downcast_ref::() { - let idx = column.index(); - let physical_field = input_physical_schema.field(idx); - let expr_col_name = column.name(); - let physical_name = physical_field.name(); - - if expr_col_name != physical_name { - // handle edge cases where the physical_name contains ':'. - let colon_count = physical_name.matches(':').count(); - let mut splits = expr_col_name.match_indices(':'); - let split_pos = splits.nth(colon_count); - - if let Some((i, _)) = split_pos { - let base_name = &expr_col_name[..i]; - if base_name == physical_name { - let updated_column = Column::new(physical_name, idx); - return Ok(Transformed::yes(Arc::new(updated_column))); - } - } - } - - // If names already match or fix is not possible, just leave it as it is - Ok(Transformed::no(node)) - } else { - Ok(Transformed::no(node)) - } - }) - .data() -} - struct OptimizationInvariantChecker<'a> { rule: &'a Arc, } @@ -2431,12 +2384,10 @@ mod tests { }; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; - use datafusion_expr::{ - col, lit, LogicalPlanBuilder, Operator, UserDefinedLogicalNodeCore, - }; + use datafusion_expr::builder::subquery_alias; + use datafusion_expr::{col, lit, LogicalPlanBuilder, UserDefinedLogicalNodeCore}; use datafusion_functions_aggregate::count::count_all; use datafusion_functions_aggregate::expr_fn::sum; - use datafusion_physical_expr::expressions::{BinaryExpr, IsNotNullExpr}; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; @@ -2997,71 +2948,6 @@ mod tests { } } - #[tokio::test] - async fn test_maybe_fix_colon_in_physical_name() { - // The physical schema has a field name with a colon - let schema = Schema::new(vec![Field::new("metric:avg", DataType::Int32, false)]); - let schema_ref: SchemaRef = Arc::new(schema); - - // What might happen after deduplication - let logical_col_name = "metric:avg:1"; - let expr_with_suffix = - Arc::new(Column::new(logical_col_name, 0)) as Arc; - let expr_result = Ok(expr_with_suffix); - - // Call function under test - let fixed_expr = - maybe_fix_physical_column_name(expr_result, &schema_ref).unwrap(); - - // Downcast back to Column so we can check the name - let col = fixed_expr - .as_any() - .downcast_ref::() - .expect("Column"); - - assert_eq!(col.name(), "metric:avg"); - } - - #[tokio::test] - async fn test_maybe_fix_nested_column_name_with_colon() { - let 
schema = Schema::new(vec![Field::new("column", DataType::Int32, false)]); - let schema_ref: SchemaRef = Arc::new(schema); - - // Construct the nested expr - let col_expr = Arc::new(Column::new("column:1", 0)) as Arc; - let is_not_null_expr = Arc::new(IsNotNullExpr::new(col_expr.clone())); - - // Create a binary expression and put the column inside - let binary_expr = Arc::new(BinaryExpr::new( - is_not_null_expr.clone(), - Operator::Or, - is_not_null_expr.clone(), - )) as Arc; - - let fixed_expr = - maybe_fix_physical_column_name(Ok(binary_expr), &schema_ref).unwrap(); - - let bin = fixed_expr - .as_any() - .downcast_ref::() - .expect("Expected BinaryExpr"); - - // Check that both sides where renamed - for expr in &[bin.left(), bin.right()] { - let is_not_null = expr - .as_any() - .downcast_ref::() - .expect("Expected IsNotNull"); - - let col = is_not_null - .arg() - .as_any() - .downcast_ref::() - .expect("Expected Column"); - - assert_eq!(col.name(), "column"); - } - } struct ErrorExtensionPlanner {} #[async_trait] @@ -3558,4 +3444,61 @@ digraph { Ok(()) } + + // Reproducer for DataFusion issue #17405: + // + // The following SQL is semantically invalid. Notably, the `SELECT left_table.a, right_table.a` + // clause is missing from the explicit logical plan: + // + // SELECT a FROM ( + // -- SELECT left_table.a, right_table.a + // FROM left_table + // FULL JOIN right_table ON left_table.a = right_table.a + // ) AS alias + // GROUP BY a; + // + // As a result, the variables within `alias` subquery are not properly distinguished, which + // leads to a bug for logical and physical planning. + // + // The fix is to implicitly insert a Projection node to represent the missing SELECT clause to + // ensure each field is correctly aliased to a unique name when the SubqueryAlias node is added. + #[tokio::test] + async fn subquery_alias_confusing_the_optimizer() -> Result<()> { + let state = make_session_state(); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let schema = Arc::new(schema); + + let table = MemTable::try_new(schema.clone(), vec![vec![]])?; + let table = Arc::new(table); + + let source = DefaultTableSource::new(table); + let source = Arc::new(source); + + let left = LogicalPlanBuilder::scan("left", source.clone(), None)?; + let right = LogicalPlanBuilder::scan("right", source, None)?.build()?; + + let join_keys = ( + vec![datafusion_common::Column::new(Some("left"), "a")], + vec![datafusion_common::Column::new(Some("right"), "a")], + ); + + let join = left.join(right, JoinType::Full, join_keys, None)?.build()?; + + let alias = subquery_alias(join, "alias")?; + + let planner = DefaultPhysicalPlanner::default(); + + let logical_plan = LogicalPlanBuilder::new(alias) + .aggregate(vec![col("a:1")], Vec::::new())? + .build()?; + let _physical_plan = planner.create_physical_plan(&logical_plan, &state).await?; + + let optimized_logical_plan = state.optimize(&logical_plan)?; + let _optimized_physical_plan = planner + .create_physical_plan(&optimized_logical_plan, &state) + .await?; + + Ok(()) + } } diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 88d49722a587..206a2fae326e 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -18,6 +18,7 @@ //! 
This module provides a builder for creating LogicalPlans use std::any::Any; +use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::iter::once; @@ -1517,37 +1518,49 @@ impl ValuesFields { } } -// `name_map` tracks a mapping between a field name and the number of appearances of that field. -// -// Some field names might already come to this function with the count (number of times it appeared) -// as a suffix e.g. id:1, so there's still a chance of name collisions, for example, -// if these three fields passed to this function: "col:1", "col" and "col", the function -// would rename them to -> col:1, col, col:1 causing a posteriror error when building the DFSchema. -// that's why we need the `seen` set, so the fields are always unique. -// -pub fn change_redundant_column(fields: &Fields) -> Vec { - let mut name_map = HashMap::new(); - let mut seen: HashSet = HashSet::new(); +/// Returns aliases to make field names unique. +/// +/// Returns a vector of optional aliases, one per input field. `None` means keep the original name, +/// `Some(alias)` means rename to the alias to ensure uniqueness. +/// +/// Used when creating [`SubqueryAlias`] or similar operations that strip table qualifiers but need +/// to maintain unique column names. +/// +/// # Example +/// Input fields: `[a, a, b, b, a, a:1]` ([`DFSchema`] valid when duplicate fields have different qualifiers) +/// Returns: `[None, Some("a:1"), None, Some("b:1"), Some("a:2"), Some("a:1:1")]` +pub fn unique_field_aliases(fields: &Fields) -> Vec> { + // Some field names might already come to this function with the count (number of times it appeared) + // as a suffix e.g. id:1, so there's still a chance of name collisions, for example, + // if these three fields passed to this function: "col:1", "col" and "col", the function + // would rename them to -> col:1, col, col:1 causing a posterior error when building the DFSchema. + // That's why we need the `seen` set, so the fields are always unique. + + // Tracks a mapping between a field name and the number of appearances of that field. + let mut name_map = HashMap::<&str, usize>::new(); + // Tracks all the fields and aliases that were previously seen. + let mut seen = HashSet::>::new(); fields - .into_iter() + .iter() .map(|field| { - let base_name = field.name(); - let count = name_map.entry(base_name.clone()).or_insert(0); - let mut new_name = base_name.clone(); + let original_name = field.name(); + let mut name = Cow::Borrowed(original_name); + + let count = name_map.entry(original_name).or_insert(0); - // Loop until we find a name that hasn't been used - while seen.contains(&new_name) { + // Loop until we find a name that hasn't been used. 
+ while seen.contains(&name) { *count += 1; - new_name = format!("{base_name}:{count}"); + name = Cow::Owned(format!("{original_name}:{count}")); } - seen.insert(new_name.clone()); + seen.insert(name.clone()); - let mut modified_field = - Field::new(&new_name, field.data_type().clone(), field.is_nullable()); - modified_field.set_metadata(field.metadata().clone()); - modified_field + match name { + Cow::Borrowed(_) => None, + Cow::Owned(alias) => Some(alias), + } }) .collect() } @@ -2675,34 +2688,6 @@ mod tests { Ok(()) } - #[test] - fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_3 = Field::new("a", DataType::Int32, false); - let t2_field_4 = Field::new("a:1", DataType::Int32, false); - let t1_field_2 = Field::new("b", DataType::Int32, false); - let t2_field_2 = Field::new("b", DataType::Int32, false); - - let field_vec = vec![ - t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4, - ]; - let remove_redundant = change_redundant_column(&Fields::from(field_vec)); - - assert_eq!( - remove_redundant, - vec![ - Field::new("a", DataType::Int32, false), - Field::new("a:1", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - Field::new("b:1", DataType::Int32, false), - Field::new("a:2", DataType::Int32, false), - Field::new("a:1:1", DataType::Int32, false), - ] - ); - Ok(()) - } - #[test] fn plan_builder_from_logical_plan() -> Result<()> { let plan = @@ -2787,4 +2772,39 @@ mod tests { Ok(()) } + + #[test] + fn test_unique_field_aliases() { + let t1_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_3 = Field::new("a", DataType::Int32, false); + let t2_field_4 = Field::new("a:1", DataType::Int32, false); + let t1_field_2 = Field::new("b", DataType::Int32, false); + let t2_field_2 = Field::new("b", DataType::Int32, false); + + let fields = vec![ + t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4, + ]; + let fields = Fields::from(fields); + + let remove_redundant = unique_field_aliases(&fields); + + // Input [a, a, b, b, a, a:1] becomes [None, a:1, None, b:1, a:2, a:1:1] + // First occurrence of each field name keeps original name (None), duplicates get + // incremental suffixes (:1, :2, etc.). + // Crucially in this case the 2nd occurrence of `a` gets rewritten to `a:1` which later + // conflicts with the last column which is _actually_ called `a:1` so we need to rename it + // as well to `a:1:1`. 
+ assert_eq!( + remove_redundant, + vec![ + None, + Some("a:1".to_string()), + None, + Some("b:1".to_string()), + Some("a:2".to_string()), + Some("a:1:1".to_string()), + ] + ); + } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index b4f2902cc43e..fdcc38996533 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -30,9 +30,9 @@ use super::invariants::{ InvariantLevel, }; use super::DdlStatement; -use crate::builder::{change_redundant_column, unnest_with_options}; +use crate::builder::{unique_field_aliases, unnest_with_options}; use crate::expr::{ - intersect_metadata_for_union, Placeholder, Sort as SortExpr, WindowFunction, + intersect_metadata_for_union, Alias, Placeholder, Sort as SortExpr, WindowFunction, WindowFunctionParams, }; use crate::expr_rewriter::{ @@ -2223,13 +2223,45 @@ impl SubqueryAlias { alias: impl Into, ) -> Result { let alias = alias.into(); - let fields = change_redundant_column(plan.schema().fields()); - let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = - DFSchema::from_unqualified_fields(fields.into(), meta_data)?.into(); - // Since schema is the same, other than qualifier, we can use existing - // functional dependencies: + + // Since SubqueryAlias will replace all field qualification for the output schema of `plan`, + // no field must share the same column name as this would lead to ambiguity when referencing + // columns in parent logical nodes. + + // Compute unique aliases, if any, for each column of the input's schema. + let aliases = unique_field_aliases(plan.schema().fields()); + let is_projection_needed = aliases.iter().any(Option::is_some); + + // Insert a projection node, if needed, to make sure aliases are applied. + let plan = if is_projection_needed { + let projection_expressions = aliases + .iter() + .zip(plan.schema().iter()) + .map(|(alias, (qualifier, field))| { + let column = + Expr::Column(Column::new(qualifier.cloned(), field.name())); + match alias { + None => column, + Some(alias) => { + Expr::Alias(Alias::new(column, qualifier.cloned(), alias)) + } + } + }) + .collect(); + let projection = Projection::try_new(projection_expressions, plan)?; + Arc::new(LogicalPlan::Projection(projection)) + } else { + plan + }; + + // Requalify fields with the new `alias`. + let fields = plan.schema().fields().clone(); + let meta_data = plan.schema().metadata().clone(); let func_dependencies = plan.schema().functional_dependencies().clone(); + + let schema = DFSchema::from_unqualified_fields(fields, meta_data)?; + let schema = Schema::from(schema); + let schema = DFSchemaRef::new( DFSchema::try_from_qualified_schema(alias.clone(), &schema)? 
.with_functional_dependencies(func_dependencies)?, diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 2845cc304bcd..e0101845aa19 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -605,26 +605,30 @@ mod tests { #[tokio::test] async fn test_multiple_joins() -> Result<()> { let plan_str = test_plan_to_string("multiple_joins.json").await?; - assert_eq!( + assert_snapshot!( plan_str, - "Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id, category]], aggr=[[]]\ - \n Values: (Int64(1), Utf8(\"info\")), (Int64(2), Utf8(\"low\"))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))" + @r#" + Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third + Left Join: left.id = right.id + SubqueryAlias: left + Projection: left.id, left.count(Int64(1)), left.id:1, left.category, right.id AS id:2, right.count(Int64(1)) AS count(Int64(1)):1 + Left Join: left.id = right.id + SubqueryAlias: left + Projection: left.id, left.count(Int64(1)), right.id AS id:1, right.category + Left Join: left.id = right.id + SubqueryAlias: left + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + SubqueryAlias: right + Aggregate: groupBy=[[id, category]], aggr=[[]] + Values: (Int64(1), Utf8("info")), (Int64(2), Utf8("low")) + SubqueryAlias: right + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + SubqueryAlias: right + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + "# ); Ok(()) } From 5035d76de48af79f0a20502b8938b09d0d2789dd Mon Sep 17 00:00:00 2001 From: Brian Myers Date: Tue, 14 Oct 2025 12:48:49 +0200 Subject: [PATCH 06/13] Set update = none for datafusion-testing git submodule --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 037accdbe424..7406be483598 100644 --- a/.gitmodules +++ b/.gitmodules @@ -8,3 +8,4 @@ path = datafusion-testing url = https://github.com/apache/datafusion-testing.git branch = main + update = none From ff2533ca2ea932bbb5a1bb865a2d1fba09423b28 Mon Sep 17 00:00:00 2001 From: Brian Myers Date: Tue, 14 Oct 2025 14:36:08 +0200 Subject: [PATCH 07/13] fix typos --- datafusion/common/src/scalar/mod.rs | 2 +- datafusion/core/src/datasource/listing/table.rs | 2 +- datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs | 2 +- datafusion/physical-expr/src/expressions/case.rs | 1 - docs/source/user-guide/introduction.md | 2 +- typos.toml | 1 + 6 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 452d6c13b3e0..d85f45bbd875 100644 --- a/datafusion/common/src/scalar/mod.rs +++ 
b/datafusion/common/src/scalar/mod.rs @@ -2378,7 +2378,7 @@ impl ScalarValue { Arc::new(array) } // explicitly enumerate unsupported types so newly added - // types must be aknowledged, Time32 and Time64 types are + // types must be acknowledged, Time32 and Time64 types are // not supported if the TimeUnit is not valid (Time32 can // only be used with Second and Millisecond, Time64 only // with Microsecond and Nanosecond) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 690ce31d0dc7..14b5bfa54eda 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1131,7 +1131,7 @@ impl ListingTable { } } -// Expressions can be used for parttion pruning if they can be evaluated using +// Expressions can be used for partition pruning if they can be evaluated using // only the partition columns and there are partition columns. fn can_be_evaluated_for_partition_pruning( partition_column_names: &[&str], diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index eaf3be2b86ed..b949ae159756 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -1102,7 +1102,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; // Top-level CoalescePartitionsExec let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc; - // Add a sort for determistic output + // Add a sort for deterministic output let plan = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr::new( col("a", &probe_side_schema).unwrap(), diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 65a210826664..5409cfe8e7e4 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -1070,7 +1070,6 @@ mod tests { .into_iter() .collect(); - //let valid_array = vec![true, false, false, true, false, tru let null_buffer = Buffer::from([0b00101001u8]); let load4 = load4 .into_data() diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index 040405f8f63e..68164c1cbfed 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -86,7 +86,7 @@ Here are some example systems built using DataFusion: By using DataFusion, projects are freed to focus on their specific features, and avoid reimplementing general (but still necessary) features such as an expression representation, standard optimizations, -parellelized streaming execution plans, file format support, etc. +parallelized streaming execution plans, file format support, etc. ## Known Users diff --git a/typos.toml b/typos.toml index 5741ab8e9cc9..7d08ce2f09c3 100644 --- a/typos.toml +++ b/typos.toml @@ -16,6 +16,7 @@ abov = "abov" Ois = "Ois" alo = "alo" Byt = "Byt" +nteger = "nteger" # abbreviations, common words, etc. typ = "typ" From 89f1b7c37ceacd6fe3aef4bfdafad84949215229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=ADa=20Adriana?= Date: Wed, 15 Oct 2025 12:11:30 +0200 Subject: [PATCH 08/13] Revert "Revert arrow upgrade and related changes" (#52) * Revert "Revert arrow upgrade and related changes (#50)" This reverts commit 5506e698c41810525331e57766f73e865d86e2a9. 
* Allow typo --- Cargo.lock | 166 ++-- Cargo.toml | 16 +- datafusion-examples/Cargo.toml | 2 +- datafusion-testing | 2 +- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 16 +- .../common/src/file_options/parquet_writer.rs | 25 +- datafusion/common/src/scalar/mod.rs | 19 +- datafusion/common/src/types/native.rs | 5 +- .../src/datasource/file_format/parquet.rs | 27 +- datafusion/core/tests/fuzz_cases/pruning.rs | 11 +- datafusion/core/tests/parquet/mod.rs | 4 +- .../core/tests/parquet/row_group_pruning.rs | 14 +- .../src/avro_to_arrow/schema.rs | 2 + .../datasource-parquet/src/file_format.rs | 2 +- datafusion/expr/src/utils.rs | 2 + datafusion/physical-plan/src/sorts/cursor.rs | 214 +---- .../proto/datafusion_common.proto | 40 +- datafusion/proto-common/src/from_proto/mod.rs | 31 +- .../proto-common/src/generated/pbjson.rs | 728 +++++++++++++++--- .../proto-common/src/generated/prost.rs | 66 +- datafusion/proto-common/src/to_proto/mod.rs | 18 +- .../src/generated/datafusion_proto_common.rs | 66 +- .../proto/src/logical_plan/file_formats.rs | 14 - datafusion/sql/src/unparser/expr.rs | 6 + datafusion/sqllogictest/test_files/array.slt | 14 +- datafusion/sqllogictest/test_files/copy.slt | 1 - .../test_files/information_schema.slt | 6 +- .../test_files/listing_table_statistics.slt | 2 +- .../test_files/parquet_statistics.slt | 16 +- .../test_files/repartition_scan.slt | 8 +- .../src/logical_plan/consumer/utils.rs | 2 + docs/source/library-user-guide/upgrading.md | 6 + docs/source/user-guide/configs.md | 3 +- parquet-testing | 2 +- testing | 2 +- typos.toml | 3 +- 37 files changed, 949 insertions(+), 614 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e58f72ee7b2f..5186c6d6d598 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,9 +240,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" dependencies = [ "arrow-arith", "arrow-array", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" dependencies = [ "bytes", "half", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" dependencies = [ "arrow-array", "arrow-cast", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,9 +354,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cb3e1d2b441e6d1d5988e3f7c4523c9466b18ef77d7c525d92d36d4cad49fbe" +checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" dependencies = [ "arrow-arith", "arrow-array", @@ -381,9 +381,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" dependencies = [ "arrow-array", "arrow-buffer", @@ -396,9 +396,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" dependencies = [ "arrow-array", "arrow-buffer", @@ -418,9 +418,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" dependencies = [ "arrow-array", "arrow-buffer", @@ -431,9 +431,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e55ecf16b9b61d433f6e63c72fc6afcf2597d7db96583de88ebb887d1822268" +checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" dependencies = [ "arrow-array", "arrow-data", @@ -443,9 +443,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" dependencies = [ "arrow-array", "arrow-buffer", @@ -456,9 +456,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +checksum = 
"85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" dependencies = [ "bitflags 2.9.1", "serde", @@ -467,9 +467,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -481,9 +481,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" dependencies = [ "arrow-array", "arrow-buffer", @@ -561,28 +561,6 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "async-trait" version = "0.1.89" @@ -844,7 +822,7 @@ dependencies = [ "rustls-pki-types", "tokio", "tokio-rustls", - "tower 0.5.2", + "tower", "tracing", ] @@ -965,11 +943,10 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" dependencies = [ - "async-trait", "axum-core", "bytes", "futures-util", @@ -985,20 +962,19 @@ dependencies = [ "rustversion", "serde", "sync_wrapper", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" dependencies = [ - "async-trait", "bytes", - "futures-util", + "futures-core", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -4190,9 +4166,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.7.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -4553,9 +4529,9 @@ dependencies = [ [[package]] name = "parquet" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -5052,11 +5028,10 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" +checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" dependencies = [ - "cfg-if", "indoc", "libc", "memoffset", @@ -5070,9 +5045,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" +checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" dependencies = [ "once_cell", "target-lexicon", @@ -5080,9 +5055,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" +checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" dependencies = [ "libc", "pyo3-build-config", @@ -5090,9 +5065,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" +checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5102,9 +5077,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" +checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -5474,7 +5449,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -6679,11 +6654,10 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" dependencies = [ - "async-stream", "async-trait", "axum", "base64 0.22.1", @@ -6701,27 +6675,7 @@ dependencies = [ "socket2 0.5.10", "tokio", "tokio-stream", - "tower 0.4.13", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", + "tower", "tower-layer", "tower-service", "tracing", @@ -6735,11 +6689,15 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 2.11.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -6755,7 +6713,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] diff --git a/Cargo.toml b/Cargo.toml index 6f7c61268e0e..53c35ed35f0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro 
= { version = "0.20", default-features = false } -arrow = { version = "55.2.0", features = [ +arrow = { version = "56.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "55.2.0", default-features = false } -arrow-flight = { version = "55.2.0", features = [ +arrow-buffer = { version = "56.0.0", default-features = false } +arrow-flight = { version = "56.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "55.2.0", default-features = false, features = [ +arrow-ipc = { version = "56.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "55.2.0", default-features = false } -arrow-schema = { version = "55.2.0", default-features = false } -async-trait = "0.1.88" +arrow-ord = { version = "56.0.0", default-features = false } +arrow-schema = { version = "56.0.0", default-features = false } +async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" chrono = { version = "0.4.41", default-features = false } @@ -157,7 +157,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" -parquet = { version = "55.2.0", default-features = false, features = [ +parquet = { version = "56.0.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 204fc8794fbb..68bb5376a1ac 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -81,7 +81,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.12.1" +tonic = "0.13.1" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-testing b/datafusion-testing index 905df5f65cc9..f72ac4075ada 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 +Subproject commit f72ac4075ada5ea9810551bc0c3e3161c61204a2 diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 6350c29b46de..aea5e51befb0 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.24.2", optional = true } +pyo3 = { version = "0.25", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 876d419b971f..cdd8e72a06cc 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -602,13 +602,6 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option, transform = str::to_lowercase, default = Some("page".into()) - /// (writing) Sets max statistics size for any column. If NULL, uses - /// default parquet writer setting - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option, default = Some(4096) - /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). 
Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. @@ -622,7 +615,7 @@ config_namespace! { /// (writing) Sets statistics truncate length. If NULL, uses /// default parquet writer setting - pub statistics_truncate_length: Option, default = None + pub statistics_truncate_length: Option, default = Some(64) /// (writing) Sets best effort maximum number of rows in data page pub data_page_row_count_limit: usize, default = 20_000 @@ -2141,13 +2134,6 @@ config_namespace_with_hashmap! { /// Sets bloom filter number of distinct values. If NULL, uses /// default parquet options pub bloom_filter_ndv: Option, default = None - - /// Sets max statistics size for the column path. If NULL, uses - /// default parquet options - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index a59cd3cb7554..185826aef47d 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -35,7 +35,7 @@ use parquet::{ metadata::KeyValue, properties::{ EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, }, }, schema::types::ColumnPath, @@ -160,16 +160,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder = builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - - // max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[allow(deprecated)] - if let Some(max_statistics_size) = options.max_statistics_size { - builder = { - #[allow(deprecated)] - builder.set_column_max_statistics_size(path, max_statistics_size) - } - } } Ok(builder) @@ -218,7 +208,6 @@ impl ParquetOptions { dictionary_enabled, dictionary_page_size_limit, statistics_enabled, - max_statistics_size, max_row_group_size, created_by, column_index_truncate_length, @@ -264,13 +253,6 @@ impl ParquetOptions { .set_data_page_row_count_limit(*data_page_row_count_limit) .set_bloom_filter_enabled(*bloom_filter_on_write); - builder = { - #[allow(deprecated)] - builder.set_max_statistics_size( - max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE), - ) - }; - if let Some(bloom_filter_fpp) = bloom_filter_fpp { builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp); }; @@ -463,12 +445,10 @@ mod tests { fn column_options_with_non_defaults( src_col_defaults: &ParquetOptions, ) -> ParquetColumnOptions { - #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), statistics_enabled: Some("none".into()), - max_statistics_size: Some(72), encoding: Some("RLE".into()), bloom_filter_enabled: Some(true), bloom_filter_fpp: Some(0.72), @@ -493,7 +473,6 @@ mod tests { dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, statistics_enabled: Some("chunk".into()), - max_statistics_size: Some(42), max_row_group_size: 42, created_by: "wordy".into(), column_index_truncate_length: Some(42), @@ -551,7 +530,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: 
bloom_filter_default_props.map(|p| p.ndv), - max_statistics_size: Some(props.max_statistics_size(&col)), } } @@ -608,7 +586,6 @@ mod tests { compression: default_col_props.compression, dictionary_enabled: default_col_props.dictionary_enabled, statistics_enabled: default_col_props.statistics_enabled, - max_statistics_size: default_col_props.max_statistics_size, bloom_filter_on_write: default_col_props .bloom_filter_enabled .unwrap_or_default(), diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index d85f45bbd875..91058575723e 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -904,11 +904,10 @@ pub fn dict_from_values( .map(|index| { if values_array.is_valid(index) { let native_index = K::Native::from_usize(index).ok_or_else(|| { - DataFusionError::Internal(format!( - "Can not create index of type {} from value {}", - K::DATA_TYPE, - index - )) + _internal_datafusion_err!( + "Can not create index of type {} from value {index}", + K::DATA_TYPE + ) })?; Ok(Some(native_index)) } else { @@ -2203,6 +2202,16 @@ impl ScalarValue { } let array: ArrayRef = match &data_type { + DataType::Decimal32(_precision, _scale) => { + return _not_impl_err!( + "Decimal32 not supported in ScalarValue::iter_to_array" + ); + } + DataType::Decimal64(_precision, _scale) => { + return _not_impl_err!( + "Decimal64 not supported in ScalarValue::iter_to_array" + ); + } DataType::Decimal128(precision, scale) => { let decimal_array = ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 39c79b4b9974..76629e555b8c 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -407,7 +407,10 @@ impl From for NativeType { DataType::Union(union_fields, _) => { Union(LogicalUnionFields::from(&union_fields)) } - DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), + DataType::Decimal32(p, s) + | DataType::Decimal64(p, s) + | DataType::Decimal128(p, s) + | DataType::Decimal256(p, s) => Decimal(p, s), DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index db704bee8801..088c4408fff5 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -523,11 +523,23 @@ mod tests { let dic_array = DictionaryArray::::try_new(keys, Arc::new(values))?; let c_dic: ArrayRef = Arc::new(dic_array); - let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?; + // Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null] + let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![ + Some("a".repeat(128)), + None, + Some("b".repeat(128)), + None, + ])); + + let batch1 = RecordBatch::try_from_iter(vec![ + ("c_dic", c_dic), + ("string_truncation", string_truncation), + ])?; // Use store_parquet to write each batch to its own file // . batch1 written into first file and includes: // - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available. + // - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact. 
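+    //   Note on the expected values asserted below: min/max statistics are
+    //   truncated to statistics_truncate_length bytes (this patch makes the
+    //   DataFusion default Some(64)). A truncated minimum remains a valid
+    //   lower bound, so it is simply cut to "a".repeat(64); a truncated
+    //   maximum must be rounded up to remain an upper bound, so the last
+    //   retained byte is incremented, yielding "b".repeat(63) + "c". Both
+    //   are therefore reported as Precision::Inexact.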
let store = Arc::new(RequestCountingObjectStore::new(Arc::new( LocalFileSystem::new(), ))); @@ -563,6 +575,19 @@ mod tests { Precision::Exact(Utf8(Some("a".into()))) ); + // column string_truncation + let string_truncation_stats = &stats.column_statistics[1]; + + assert_eq!(string_truncation_stats.null_count, Precision::Exact(2)); + assert_eq!( + string_truncation_stats.max_value, + Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c"))) + ); + assert_eq!( + string_truncation_stats.min_value, + Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64)))) + ); + Ok(()) } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index 4c0073b94be7..f8bd4dbc1a76 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -319,14 +319,9 @@ async fn write_parquet_file( row_groups: Vec>, ) -> Bytes { let mut buf = BytesMut::new().writer(); - let mut props = WriterProperties::builder(); - if let Some(truncation_length) = truncation_length { - props = { - #[allow(deprecated)] - props.set_max_statistics_size(truncation_length) - } - } - props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) // row group level + .set_statistics_truncate_length(truncation_length); let props = props.build(); { let mut writer = diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index d0ddd9a0b0f1..c44d14abd381 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -110,11 +110,11 @@ struct ContextWithParquet { /// The output of running one of the test cases struct TestOutput { - /// The input string + /// The input query SQL sql: String, /// Execution metrics for the Parquet Scan parquet_metrics: MetricsSet, - /// number of rows in results + /// number of actual rows in results result_rows: usize, /// the contents of the input, as a string pretty_input: String, diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 8613cd481be1..44409166d3ce 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -34,7 +34,7 @@ struct RowGroupPruningTest { expected_files_pruned_by_statistics: Option, expected_row_group_matched_by_bloom_filter: Option, expected_row_group_pruned_by_bloom_filter: Option, - expected_results: usize, + expected_rows: usize, } impl RowGroupPruningTest { // Start building the test configuration @@ -48,7 +48,7 @@ impl RowGroupPruningTest { expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, - expected_results: 0, + expected_rows: 0, } } @@ -99,9 +99,9 @@ impl RowGroupPruningTest { self } - // Set the expected rows for the test + /// Set the number of rows expected in the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { - self.expected_results = rows; + self.expected_rows = rows; self } @@ -145,8 +145,10 @@ impl RowGroupPruningTest { ); assert_eq!( output.result_rows, - self.expected_results, - "mismatched expected rows: {}", + self.expected_rows, + "Expected {} rows, got {}: {}", + self.expected_rows, + output.result_rows, + output.description(), ); } diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs index 39d93d2f193e..3fce0d4826a2 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs @@ -238,6 +238,8 @@ fn default_field_name(dt: &DataType) -> &str { | DataType::LargeListView(_) => { unimplemented!("View support not implemented") } + DataType::Decimal32(_, _) => "decimal", + DataType::Decimal64(_, _) => "decimal", DataType::Decimal128(_, _) => "decimal", DataType::Decimal256(_, _) => "decimal", } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 1ec2015ce549..1fcc1721017c 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -19,11 +19,11 @@ use std::any::Any; use std::cell::RefCell; -use std::fmt; use std::fmt::Debug; use std::ops::Range; use std::rc::Rc; use std::sync::Arc; +use std::{fmt, vec}; use arrow::array::RecordBatch; use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit}; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 684eba59536b..2e364d0d2b80 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -814,6 +814,8 @@ pub fn can_hash(data_type: &DataType) -> bool { DataType::Float16 => true, DataType::Float32 => true, DataType::Float64 => true, + DataType::Decimal32(_, _) => true, + DataType::Decimal64(_, _) => true, DataType::Decimal128(_, _) => true, DataType::Decimal256(_, _) => true, DataType::Timestamp(_, _) => true, diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index 8ab603e04961..54dc2414e4f0 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -289,120 +289,6 @@ impl CursorArray for StringViewArray { } } -/// Todo use arrow-rs side api after: and released -/// Builds a 128-bit composite key for an inline value: -/// -/// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). -/// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings -/// (or those with fewer meaningful bytes) always numerically sort before longer ones. -/// -/// This function extracts the length and the 12-byte inline string data from the raw -/// little-endian `u128` representation, converts them to big-endian ordering, and packs them -/// into a single `u128` value suitable for fast, branchless comparisons. -/// -/// # Why include length? -/// -/// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes -/// compare equal—either because one is a true prefix of the other or because zero-padding -/// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare -/// handles both content and length in one go. -/// -/// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) -/// -/// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | -/// |------------|-----------------------|---------------------------------| -/// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | -/// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | -/// -/// Both inline parts become `62 61 72 00…00`, so they tie on content. 
The length field -/// then differentiates: -/// -/// ```text -/// key("bar") = 0x0000000000000000000062617200000003 -/// key("bar\0") = 0x0000000000000000000062617200000004 -/// ⇒ key("bar") < key("bar\0") -/// ``` -/// # Inlining and Endianness -/// -/// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory -/// representation is little‑endian on x86/ARM. -/// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. -/// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. -/// - We serialize `length` as big‑endian into `buf[12..16]`. -/// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte -/// and `buf[15]` as the least significant, producing a `u128` whose integer value -/// directly encodes “inline data then length” in big‑endian form. -/// -/// This ensures that a simple `u128` comparison is equivalent to the desired -/// lexicographical comparison of the inline bytes followed by length. -#[inline(always)] -pub fn inline_key_fast(raw: u128) -> u128 { - // 1. Decompose `raw` into little‑endian bytes: - // - raw_bytes[0..4] = length in LE - // - raw_bytes[4..16] = inline string data - let raw_bytes = raw.to_le_bytes(); - - // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). - let length = raw as u32; - - // 3. Build a 16‑byte buffer in big‑endian order: - // - buf[0..12] = inline string bytes (in original order) - // - buf[12..16] = length.to_be_bytes() (BE) - let mut buf = [0u8; 16]; - buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data - - // Why convert length to big-endian for comparison? - // - // Rust (on most platforms) stores integers in little-endian format, - // meaning the least significant byte is at the lowest memory address. - // For example, an u32 value like 0x22345677 is stored in memory as: - // - // [0x77, 0x56, 0x34, 0x22] // little-endian layout - // ^ ^ ^ ^ - // LSB ↑↑↑ MSB - // - // This layout is efficient for arithmetic but *not* suitable for - // lexicographic (dictionary-style) comparison of byte arrays. - // - // To compare values by byte order—e.g., for sorted keys or binary trees— - // we must convert them to **big-endian**, where: - // - // - The most significant byte (MSB) comes first (index 0) - // - The least significant byte (LSB) comes last (index N-1) - // - // In big-endian, the same u32 = 0x22345677 would be represented as: - // - // [0x22, 0x34, 0x56, 0x77] - // - // This ordering aligns with natural string/byte sorting, so calling - // `.to_be_bytes()` allows us to construct - // keys where standard numeric comparison (e.g., `<`, `>`) behaves - // like lexicographic byte comparison. - buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE - - // 4. Deserialize the buffer as a big‑endian u128: - // buf[0] is MSB, buf[15] is LSB. - // Details: - // Note on endianness and layout: - // - // Although `buf[0]` is stored at the lowest memory address, - // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, - // and `buf[15]` as the **least significant byte (LSB)**. - // - // This is the core principle of **big-endian decoding**: - // - Byte at index 0 maps to bits 127..120 (highest) - // - Byte at index 1 maps to bits 119..112 - // - ... - // - Byte at index 15 maps to bits 7..0 (lowest) - // - // So even though memory layout goes from low to high (left to right), - // big-endian treats the **first byte** as highest in value. 
- // - // This guarantees that comparing two `u128` keys is equivalent to lexicographically - // comparing the original inline bytes, followed by length. - u128::from_be_bytes(buf) -} - impl CursorValues for StringViewArray { fn len(&self) -> usize { self.views().len() @@ -460,7 +346,8 @@ impl CursorValues for StringViewArray { if l.data_buffers().is_empty() && r.data_buffers().is_empty() { let l_view = unsafe { l.views().get_unchecked(l_idx) }; let r_view = unsafe { r.views().get_unchecked(r_idx) }; - return inline_key_fast(*l_view).cmp(&inline_key_fast(*r_view)); + return StringViewArray::inline_key_fast(*l_view) + .cmp(&StringViewArray::inline_key_fast(*r_view)); } unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) } @@ -555,7 +442,6 @@ impl CursorValues for ArrayValues { #[cfg(test)] mod tests { - use arrow::array::GenericBinaryArray; use datafusion_execution::memory_pool::{ GreedyMemoryPool, MemoryConsumer, MemoryPool, }; @@ -720,100 +606,4 @@ mod tests { b.advance(); assert_eq!(a.cmp(&b), Ordering::Less); } - - /// Integration tests for `inline_key_fast` covering: - /// - /// 1. Monotonic ordering across increasing lengths and lexical variations. - /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. - /// - /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why - /// the length field is required even when all inline bytes fit in 12 bytes. - /// - /// The test includes strings that verify correct byte order (prevent reversal bugs), - /// and length-based tie-breaking in the composite key. - /// - /// The test confirms that `inline_key_fast` produces keys which sort consistently - /// with the expected lexicographical order of the raw byte arrays. - #[test] - fn test_inline_key_fast_various_lengths_and_lexical() { - /// Helper to create a raw u128 value representing an inline ByteView: - /// - `length`: number of meaningful bytes (must be ≤ 12) - /// - `data`: the actual inline data bytes - /// - /// The first 4 bytes encode length in little-endian, - /// the following 12 bytes contain the inline string data (unpadded). - fn make_raw_inline(length: u32, data: &[u8]) -> u128 { - assert!(length as usize <= 12, "Inline length must be ≤ 12"); - assert!( - data.len() == length as usize, - "Data length must match `length`" - ); - - let mut raw_bytes = [0u8; 16]; - raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian - raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data - u128::from_le_bytes(raw_bytes) - } - - // Test inputs: various lengths and lexical orders, - // plus special cases for byte order and length tie-breaking - let test_inputs: Vec<&[u8]> = vec![ - b"a", - b"aa", - b"aaa", - b"aab", - b"abcd", - b"abcde", - b"abcdef", - b"abcdefg", - b"abcdefgh", - b"abcdefghi", - b"abcdefghij", - b"abcdefghijk", - b"abcdefghijkl", - // Tests for byte-order reversal bug: - // Without the fix, "backend one" would compare as "eno dnekcab", - // causing incorrect sort order relative to "backend two". - b"backend one", - b"backend two", - // Tests length-tiebreaker logic: - // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data, - // so only the length differentiates their ordering. 
- b"bar", - b"bar\0", - // Additional lexical and length tie-breaking cases with same prefix, in correct lex order: - b"than12Byt", - b"than12Bytes", - b"than12Bytes\0", - b"than12Bytesx", - b"than12Bytex", - b"than12Bytez", - // Additional lexical tests - b"xyy", - b"xyz", - b"xza", - ]; - - // Create a GenericBinaryArray for cross-comparison of lex order - let array: GenericBinaryArray = GenericBinaryArray::from( - test_inputs.iter().map(|s| Some(*s)).collect::>(), - ); - - for i in 0..array.len() - 1 { - let v1 = array.value(i); - let v2 = array.value(i + 1); - - // Assert the array's natural lexical ordering is correct - assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); - - // Assert the keys produced by inline_key_fast reflect the same ordering - let key1 = inline_key_fast(make_raw_inline(v1.len() as u32, v1)); - let key2 = inline_key_fast(make_raw_inline(v2.len() as u32, v2)); - - assert!( - key1 < key2, - "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", - ); - } - } } diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 8cb272605899..f5c79cf3d9a4 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -136,7 +136,19 @@ enum IntervalUnit{ MonthDayNano = 2; } -message Decimal{ +message Decimal32Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal64Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal128Type { reserved 1, 2; uint32 precision = 3; int32 scale = 4; @@ -286,6 +298,8 @@ message ScalarValue{ ScalarNestedValue struct_value = 32; ScalarNestedValue map_value = 41; + Decimal32 decimal32_value = 43; + Decimal64 decimal64_value = 44; Decimal128 decimal128_value = 20; Decimal256 decimal256_value = 39; @@ -310,6 +324,18 @@ message ScalarValue{ } } +message Decimal32{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + +message Decimal64{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + message Decimal128{ bytes value = 1; int64 p = 2; @@ -352,7 +378,9 @@ message ArrowType{ TimeUnit TIME32 = 21 ; TimeUnit TIME64 = 22 ; IntervalUnit INTERVAL = 23 ; - Decimal DECIMAL = 24 ; + Decimal32Type DECIMAL32 = 40; + Decimal64Type DECIMAL64 = 41; + Decimal128Type DECIMAL128 = 24; Decimal256Type DECIMAL256 = 36; List LIST = 25; List LARGE_LIST = 26; @@ -480,9 +508,7 @@ message ParquetColumnOptions { uint64 bloom_filter_ndv = 7; } - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } + reserved 8; // used to be uint32 max_statistics_size = 8; } message ParquetOptions { @@ -521,9 +547,7 @@ message ParquetOptions { string statistics_enabled = 13; } - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } + reserved 14; // used to be uint64 max_statistics_size = 14; oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 0823e150268d..c5242d0176e6 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -37,6 +37,7 @@ use datafusion_common::{ TableParquetOptions, }, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, + not_impl_err, parsers::CompressionTypeVariant, plan_datafusion_err, stats::Precision, @@ -257,7 +258,15 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for
DataType { arrow_type::ArrowTypeEnum::Interval(interval_unit) => { DataType::Interval(parse_i32_to_interval_unit(interval_unit)?) } - arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { + arrow_type::ArrowTypeEnum::Decimal32(protobuf::Decimal32Type { + precision, + scale, + }) => DataType::Decimal32(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal64(protobuf::Decimal64Type { + precision, + scale, + }) => DataType::Decimal64(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal128(protobuf::Decimal128Type { precision, scale, }) => DataType::Decimal128(*precision as u8, *scale as i8), @@ -469,6 +478,14 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { let null_type: DataType = v.try_into()?; null_type.try_into().map_err(Error::DataFusionError)? } + Value::Decimal32Value(_val) => { + return not_impl_err!("Decimal32 protobuf deserialization") + .map_err(Error::DataFusionError) + } + Value::Decimal64Value(_val) => { + return not_impl_err!("Decimal64 protobuf deserialization") + .map_err(Error::DataFusionError) + } Value::Decimal128Value(val) => { let array = vec_to_array(val.value.clone()); Self::Decimal128( @@ -938,12 +955,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -1009,12 +1020,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value .encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index f35fd1594695..48782ff1d93a 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -243,8 +243,14 @@ impl serde::Serialize for ArrowType { .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; struct_ser.serialize_field("INTERVAL", &v)?; } - arrow_type::ArrowTypeEnum::Decimal(v) => { - struct_ser.serialize_field("DECIMAL", v)?; + arrow_type::ArrowTypeEnum::Decimal32(v) => { + struct_ser.serialize_field("DECIMAL32", v)?; + } + arrow_type::ArrowTypeEnum::Decimal64(v) => { + struct_ser.serialize_field("DECIMAL64", v)?; + } + arrow_type::ArrowTypeEnum::Decimal128(v) => { + struct_ser.serialize_field("DECIMAL128", v)?; } arrow_type::ArrowTypeEnum::Decimal256(v) => { struct_ser.serialize_field("DECIMAL256", v)?; @@ -314,7 +320,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32", "TIME64", "INTERVAL", - "DECIMAL", + "DECIMAL32", + "DECIMAL64", + "DECIMAL128", "DECIMAL256", "LIST", "LARGE_LIST", @@ -356,7 +364,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { Time32, Time64, Interval, - Decimal, + Decimal32, + Decimal64, + Decimal128, Decimal256, List, LargeList, @@ -413,7 +423,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32" => 
Ok(GeneratedField::Time32), "TIME64" => Ok(GeneratedField::Time64), "INTERVAL" => Ok(GeneratedField::Interval), - "DECIMAL" => Ok(GeneratedField::Decimal), + "DECIMAL32" => Ok(GeneratedField::Decimal32), + "DECIMAL64" => Ok(GeneratedField::Decimal64), + "DECIMAL128" => Ok(GeneratedField::Decimal128), "DECIMAL256" => Ok(GeneratedField::Decimal256), "LIST" => Ok(GeneratedField::List), "LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList), @@ -628,11 +640,25 @@ impl<'de> serde::Deserialize<'de> for ArrowType { } arrow_type_enum__ = map_.next_value::<::std::option::Option>()?.map(|x| arrow_type::ArrowTypeEnum::Interval(x as i32)); } - GeneratedField::Decimal => { + GeneratedField::Decimal32 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL32")); + } + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal32) +; + } + GeneratedField::Decimal64 => { if arrow_type_enum__.is_some() { - return Err(serde::de::Error::duplicate_field("DECIMAL")); + return Err(serde::de::Error::duplicate_field("DECIMAL64")); } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal) + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal64) +; + } + GeneratedField::Decimal128 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL128")); + } + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal128) ; } GeneratedField::Decimal256 => { @@ -2222,45 +2248,431 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions { } null_value__ = Some(map_.next_value()?); } - GeneratedField::Quote => { - if quote__.is_some() { - return Err(serde::de::Error::duplicate_field("quote")); + GeneratedField::Quote => { + if quote__.is_some() { + return Err(serde::de::Error::duplicate_field("quote")); + } + quote__ = Some(map_.next_value()?); + } + GeneratedField::Escape => { + if escape__.is_some() { + return Err(serde::de::Error::duplicate_field("escape")); + } + escape__ = Some(map_.next_value()?); + } + GeneratedField::DoubleQuote => { + if double_quote__.is_some() { + return Err(serde::de::Error::duplicate_field("doubleQuote")); + } + double_quote__ = Some(map_.next_value()?); + } + } + } + Ok(CsvWriterOptions { + compression: compression__.unwrap_or_default(), + delimiter: delimiter__.unwrap_or_default(), + has_header: has_header__.unwrap_or_default(), + date_format: date_format__.unwrap_or_default(), + datetime_format: datetime_format__.unwrap_or_default(), + timestamp_format: timestamp_format__.unwrap_or_default(), + time_format: time_format__.unwrap_or_default(), + null_value: null_value__.unwrap_or_default(), + quote: quote__.unwrap_or_default(), + escape: escape__.unwrap_or_default(), + double_quote: double_quote__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128 { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + 
#[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128 { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + "s", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); + } + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); + } + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); + } + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128Type { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128Type { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128Type") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal256 { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal256 { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + "s", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal256; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal256") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); } - quote__ = Some(map_.next_value()?); + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; } - GeneratedField::Escape => { - if escape__.is_some() { - return Err(serde::de::Error::duplicate_field("escape")); + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); } - escape__ = Some(map_.next_value()?); + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } - GeneratedField::DoubleQuote => { - if double_quote__.is_some() { - return Err(serde::de::Error::duplicate_field("doubleQuote")); + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); } - double_quote__ = Some(map_.next_value()?); + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } } } - Ok(CsvWriterOptions { - compression: compression__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), - has_header: has_header__.unwrap_or_default(), - date_format: date_format__.unwrap_or_default(), - datetime_format: datetime_format__.unwrap_or_default(), - timestamp_format: timestamp_format__.unwrap_or_default(), - time_format: time_format__.unwrap_or_default(), - null_value: null_value__.unwrap_or_default(), - quote: quote__.unwrap_or_default(), - escape: escape__.unwrap_or_default(), - double_quote: double_quote__.unwrap_or_default(), + Ok(Decimal256 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal { +impl serde::Serialize for Decimal256Type { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2274,7 +2686,7 @@ impl serde::Serialize for Decimal { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2284,7 +2696,7 @@ impl serde::Serialize for Decimal { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal { +impl<'de> serde::Deserialize<'de> for Decimal256Type { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2331,13 +2743,13 @@ impl<'de> serde::Deserialize<'de> for Decimal { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal; + type Value = Decimal256Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal") + formatter.write_str("struct datafusion_common.Decimal256Type") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2363,16 +2775,16 @@ impl<'de> serde::Deserialize<'de> for Decimal { } } } - Ok(Decimal { + Ok(Decimal256Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal", FIELDS, GeneratedVisitor) + 
deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal128 { +impl serde::Serialize for Decimal32 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2389,7 +2801,7 @@ impl serde::Serialize for Decimal128 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2408,7 +2820,7 @@ impl serde::Serialize for Decimal128 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal128 { +impl<'de> serde::Deserialize<'de> for Decimal32 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2458,13 +2870,13 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal128; + type Value = Decimal32; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal128") + formatter.write_str("struct datafusion_common.Decimal32") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2499,17 +2911,129 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } } } - Ok(Decimal128 { + Ok(Decimal32 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal32", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256 { +impl serde::Serialize for Decimal32Type { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal32Type { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } 
+ } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal32Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal32Type") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal32Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal32Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal64 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2526,7 +3050,7 @@ impl serde::Serialize for Decimal256 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2545,7 +3069,7 @@ impl serde::Serialize for Decimal256 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256 { +impl<'de> serde::Deserialize<'de> for Decimal64 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2595,13 +3119,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256; + type Value = Decimal64; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256") + formatter.write_str("struct datafusion_common.Decimal64") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2636,17 +3160,17 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } } } - Ok(Decimal256 { + Ok(Decimal64 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256Type { +impl serde::Serialize for Decimal64Type { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2660,7 +3184,7 @@ impl serde::Serialize for Decimal256Type { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2670,7 +3194,7 @@ impl serde::Serialize for 
Decimal256Type { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256Type { +impl<'de> serde::Deserialize<'de> for Decimal64Type { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2717,13 +3241,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256Type; + type Value = Decimal64Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256Type") + formatter.write_str("struct datafusion_common.Decimal64Type") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2749,13 +3273,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } } } - Ok(Decimal256Type { + Ok(Decimal64Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64Type", FIELDS, GeneratedVisitor) } } impl serde::Serialize for DfField { @@ -4589,9 +5113,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4644,13 +5165,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4673,8 +5187,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4686,7 +5198,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4715,7 +5226,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4742,7 +5252,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::BloomFilterEnabled => { @@ -4787,12 +5296,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4803,7 +5306,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -5090,9 +5592,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5216,15 +5715,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5329,8 +5819,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "statistics_truncate_length", @@ -5370,7 +5858,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, StatisticsTruncateLength, Encoding, @@ -5422,7 +5909,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength), "encoding" => Ok(GeneratedField::Encoding), @@ -5472,7 +5958,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut statistics_truncate_length_opt__ = None; let mut encoding_opt__ = None; @@ -5639,12 +6124,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - 
GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5708,7 +6187,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, statistics_truncate_length_opt: statistics_truncate_length_opt__, encoding_opt: encoding_opt__, @@ -6959,6 +7437,12 @@ impl serde::Serialize for ScalarValue { scalar_value::Value::MapValue(v) => { struct_ser.serialize_field("mapValue", v)?; } + scalar_value::Value::Decimal32Value(v) => { + struct_ser.serialize_field("decimal32Value", v)?; + } + scalar_value::Value::Decimal64Value(v) => { + struct_ser.serialize_field("decimal64Value", v)?; + } scalar_value::Value::Decimal128Value(v) => { struct_ser.serialize_field("decimal128Value", v)?; } @@ -7085,6 +7569,10 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "structValue", "map_value", "mapValue", + "decimal32_value", + "decimal32Value", + "decimal64_value", + "decimal64Value", "decimal128_value", "decimal128Value", "decimal256_value", @@ -7147,6 +7635,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { FixedSizeListValue, StructValue, MapValue, + Decimal32Value, + Decimal64Value, Decimal128Value, Decimal256Value, Date64Value, @@ -7208,6 +7698,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue), "structValue" | "struct_value" => Ok(GeneratedField::StructValue), "mapValue" | "map_value" => Ok(GeneratedField::MapValue), + "decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value), + "decimal64Value" | "decimal64_value" => Ok(GeneratedField::Decimal64Value), "decimal128Value" | "decimal128_value" => Ok(GeneratedField::Decimal128Value), "decimal256Value" | "decimal256_value" => Ok(GeneratedField::Decimal256Value), "date64Value" | "date_64_value" => Ok(GeneratedField::Date64Value), @@ -7385,6 +7877,20 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { return Err(serde::de::Error::duplicate_field("mapValue")); } value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::MapValue) +; + } + GeneratedField::Decimal32Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal32Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal32Value) +; + } + GeneratedField::Decimal64Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal64Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal64Value) ; } GeneratedField::Decimal128Value => { diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index ac4a9ea4be69..aa23cea57470 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -117,7 +117,21 
@@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. 
pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -786,10 +817,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -830,11 +857,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index b6cbe5759cfc..c06427065733 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -189,7 +189,15 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { value: Some(Box::new(value_type.as_ref().try_into()?)), })) } - DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal { + DataType::Decimal32(precision, scale) => Self::Decimal32(protobuf::Decimal32Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal64(precision, scale) => Self::Decimal64(protobuf::Decimal64Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal128(precision, scale) => Self::Decimal128(protobuf::Decimal128Type { precision: *precision as u32, scale: *scale as i32, }), @@ -817,8 +825,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - #[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -858,12 +864,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - #[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs 
b/datafusion/proto/src/generated/datafusion_proto_common.rs index ac4a9ea4be69..aa23cea57470 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -117,7 +117,21 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub 
max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -786,10 +817,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -830,11 +857,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 620442c79e72..654607bd733d 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -382,9 +382,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -440,9 +437,6 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), - max_statistics_size_opt: options.max_statistics_size.map(|size| { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) - }), }) } }).collect(), @@ -481,9 +475,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), - max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, - }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -542,11 +533,6 @@ impl From for ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), - max_statistics_size: proto.max_statistics_size_opt.map( - |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { 
- v as usize - }, - ), } } } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index c4baf9b438e9..d4e911a62c09 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1726,6 +1726,12 @@ impl Unparser<'_> { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), + DataType::Decimal32(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } + DataType::Decimal64(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 9ec3ff8409c2..96ab84ab9095 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -8062,7 +8062,19 @@ select arrow_typeof(a) from fixed_size_col_table; FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) -statement error +query ? rowsort +SELECT DISTINCT a FROM fixed_size_col_table +---- +[1, 2, 3] +[4, 5, 6] + +query ?I rowsort +SELECT a, count(*) FROM fixed_size_col_table GROUP BY a +---- +[1, 2, 3] 1 +[4, 5, 6] 1 + +statement error Cast error: Cannot cast to FixedSizeList\(3\): value at index 0 has length 2 create table varying_fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5]); # https://github.com/apache/datafusion/issues/16187 diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 5eeb05e814ac..096cde86f26f 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -332,7 +332,6 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, -'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value' diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 8eb1fa9a0086..fb2c89020112 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -242,7 +242,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 -datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -253,7 +252,7 @@ datafusion.execution.parquet.schema_force_view_types true datafusion.execution.parquet.skip_arrow_metadata false datafusion.execution.parquet.skip_metadata true datafusion.execution.parquet.statistics_enabled page -datafusion.execution.parquet.statistics_truncate_length NULL +datafusion.execution.parquet.statistics_truncate_length 64 datafusion.execution.parquet.write_batch_size 1024 datafusion.execution.parquet.writer_version 1.0 
datafusion.execution.planning_concurrency 13 @@ -357,7 +356,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer @@ -368,7 +366,7 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting -datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statistics truncate length. 
If NULL, uses default parquet writer setting +datafusion.execution.parquet.statistics_truncate_length 64 (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0" datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt index a43d4be62551..37daf551c2c3 100644 --- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt +++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt @@ -35,7 +35,7 @@ query TT explain format indent select * from t; ---- logical_plan TableScan: t projection=[int_col, str_col] -physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] +physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(212), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index 2b70f4f9c00a..f9e89902998c 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -59,10 +59,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] 
+01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok @@ -85,10 +85,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt 
b/datafusion/sqllogictest/test_files/repartition_scan.slt index 0b851f917855..c536c8165c5a 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: 
target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..280], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:280..554, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..286], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:286..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..266], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:266..526, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..272], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:272..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..277], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:281..563], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:277..554]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..263], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..268], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:268..537], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:263..526]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs 
index 67215e8e343e..f7eedcb7a2b2 100644 --- a/datafusion/substrait/src/logical_plan/consumer/utils.rs +++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs @@ -213,6 +213,8 @@ pub fn rename_data_type( | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => Ok(data_type.clone()), } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index a9c39b086118..828beec8545d 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -212,6 +212,12 @@ use datafusion_physical_expr_adapter::{ }; ``` +### Upgrade to arrow `56.0.0` and parquet `56.0.0` + +This version of DataFusion upgrades the underlying Apache Arrow implementation +to version `56.0.0`. See the [release notes](https://github.com/apache/arrow-rs/releases/tag/56.0.0) +for more details. + ### Added `ExecutionPlan::reset_state` In order to fix a bug in DataFusion `49.0.0` where dynamic filters (currently only generated in the presence of a query such as `ORDER BY ... LIMIT ...`) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index cdc6d615e37f..3d4730958fb3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -96,11 +96,10 @@ The following configuration settings are available: | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 50.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. 
If NULL, uses default parquet writer setting | | datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | diff --git a/parquet-testing b/parquet-testing index a3d96a65e11e..107b36603e05 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 +Subproject commit 107b36603e051aee26bd93e04b871034f6c756c0 diff --git a/testing b/testing index 0d60ccae40d0..d2a137123034 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 +Subproject commit d2a13712303498963395318a4eb42872e66aead7 diff --git a/typos.toml b/typos.toml index 7d08ce2f09c3..1b1af4be339e 100644 --- a/typos.toml +++ b/typos.toml @@ -15,14 +15,13 @@ aroun = "aroun" abov = "abov" Ois = "Ois" alo = "alo" -Byt = "Byt" -nteger = "nteger" # abbreviations, common words, etc. typ = "typ" datas = "datas" YOUY = "YOUY" lits = "lits" +nteger = "nteger" # exposed to public API Serializeable = "Serializeable" From d95ac0c8187bbf9b35dde2799d4479f6d7cfe3e2 Mon Sep 17 00:00:00 2001 From: Brian Myers Date: Thu, 16 Oct 2025 10:40:39 +0200 Subject: [PATCH 09/13] turn off submodule updating for all testing submodules --- .github/actions/setup-submodules/action.yaml | 15 ++++++++ .github/workflows/dependencies.yml | 3 +- .github/workflows/docs_pr.yaml | 3 +- .github/workflows/extended.yml | 12 ++++--- .github/workflows/rust.yml | 36 +++++++++++++------- .gitmodules | 2 ++ 6 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 .github/actions/setup-submodules/action.yaml diff --git a/.github/actions/setup-submodules/action.yaml b/.github/actions/setup-submodules/action.yaml new file mode 100644 index 000000000000..4273f054d234 --- /dev/null +++ b/.github/actions/setup-submodules/action.yaml @@ -0,0 +1,15 @@ +name: 'Setup Submodules' +description: 'Initialize and update git submodules for testing' +runs: + using: 'composite' + steps: + - name: Initialize and update submodules + shell: bash + run: | + git config --global --add safe.directory $GITHUB_WORKSPACE + git submodule init + # Override update=none setting for CI + git config submodule.testing.update checkout + git config submodule.parquet-testing.update checkout + git config submodule.datafusion-testing.update checkout + git submodule update --depth 1 \ No newline at end of file diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml index fede423a507f..fbc5d9c0f804 100644 --- a/.github/workflows/dependencies.yml +++ b/.github/workflows/dependencies.yml @@ -43,8 +43,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml index c182f2ef85d2..84efd227a2d6 100644 --- a/.github/workflows/docs_pr.yaml +++ b/.github/workflows/docs_pr.yaml @@ -42,8 +42,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Python uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 9343997e0568..83246ca2457e 100644 --- 
a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -63,8 +63,9 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Install Rust run: | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y @@ -87,8 +88,9 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Install Rust @@ -131,8 +133,9 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -152,8 +155,9 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 05a6d70f0278..d1b402f22906 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -269,8 +269,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -310,8 +311,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain run: rustup toolchain install stable - name: Run tests (excluding doctests) @@ -336,8 +338,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -366,8 +369,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -424,8 +428,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -471,8 +476,9 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: - submodules: true fetch-depth: 1 + - name: Setup Submodules + uses: ./.github/actions/setup-submodules - name: Setup Rust toolchain uses: 
./.github/actions/setup-builder
         with:
@@ -495,8 +501,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -549,8 +556,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-macos-aarch64-builder
       - name: Run tests (excluding doctests)
@@ -566,8 +574,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
       - name: Install PyArrow
         run: |
           echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV
@@ -665,8 +674,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -690,8 +700,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -711,8 +722,9 @@ jobs:
     steps:
       - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          submodules: true
           fetch-depth: 1
+      - name: Setup Submodules
+        uses: ./.github/actions/setup-submodules
      - name: Setup Rust toolchain
        uses: ./.github/actions/setup-builder
        with:
diff --git a/.gitmodules b/.gitmodules
index 7406be483598..14f7ce2ec08d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,11 @@
 [submodule "parquet-testing"]
 	path = parquet-testing
 	url = https://github.com/apache/parquet-testing.git
+	update = none
 [submodule "testing"]
 	path = testing
 	url = https://github.com/apache/arrow-testing
+	update = none
 [submodule "datafusion-testing"]
 	path = datafusion-testing
 	url = https://github.com/apache/datafusion-testing.git

From aec941a36c3b3779fd25335035e906ccc8dc26c7 Mon Sep 17 00:00:00 2001
From: Ahmed Mezghani <38987709+ahmed-mez@users.noreply.github.com>
Date: Fri, 17 Oct 2025 18:48:35 +0200
Subject: [PATCH 10/13] fix: Add dictionary coercion support for numeric
 comparison operations (#18099)

## Which issue does this PR close?

Fixes comparison errors when using dictionary-encoded types with comparison
functions like NULLIF.

## Rationale for this change

When using dictionary-encoded columns (e.g., Dictionary(Int32, Utf8)) in
comparison operations with literals or other types, DataFusion would throw an
error stating the types are not comparable. This was particularly problematic
for functions like NULLIF which rely on comparison coercion.

The issue was that comparison_coercion_numeric didn't handle dictionary types,
even though the general comparison_coercion function did have dictionary
support.

## What changes are included in this PR?

1. Refactored dictionary comparison logic: Extracted common dictionary
   coercion logic into dictionary_comparison_coercion_generic to avoid code
   duplication.
2. Added numeric-specific dictionary coercion: Introduced
   dictionary_comparison_coercion_numeric that uses numeric-preferring
   comparison rules when dealing with dictionary value types.
3. Updated comparison_coercion_numeric: Added a call to
   dictionary_comparison_coercion_numeric in the coercion chain to properly
   handle dictionary types.
4. Added sqllogictest cases demonstrating the fix works for various
   dictionary comparison scenarios.

## Are these changes tested?

Yes, added tests in datafusion/sqllogictest/test_files/nullif.slt covering:

- Dictionary type compared with string literal
- String compared with dictionary type
- Dictionary compared with dictionary

All tests pass with the fix and would fail without it.

## Are there any user-facing changes?

This is a bug fix that enables previously failing queries to work correctly.
No breaking changes or API modifications.
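As a minimal sketch of the resulting coercions (illustration only; it assumes
the public `comparison_coercion_numeric` entry point in
`datafusion_expr_common::type_coercion::binary`, whose exact path may differ
across versions, and the return values follow the rules described above):

```rust
use arrow::datatypes::DataType;
use datafusion_expr_common::type_coercion::binary::comparison_coercion_numeric;

fn main() {
    let dict =
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));

    // Dictionary vs. its value type: the dictionary type is preserved, so a
    // query like NULLIF(arrow_cast(col1, 'Dictionary(Int32, Utf8)'), 'bar')
    // can now find a common comparison type instead of erroring.
    assert_eq!(
        comparison_coercion_numeric(&dict, &DataType::Utf8),
        Some(dict.clone())
    );

    // Dictionaries with different key types coerce through their value types.
    let dict_u16 =
        DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8));
    assert_eq!(
        comparison_coercion_numeric(&dict, &dict_u16),
        Some(DataType::Utf8)
    );
}
```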
---
 .../expr-common/src/type_coercion/binary.rs   | 58 +++++++++++++++----
 datafusion/sqllogictest/test_files/nullif.slt | 36 ++++++++++++
 2 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs
index 9264a2940dd1..18b63b770a2e 100644
--- a/datafusion/expr-common/src/type_coercion/binary.rs
+++ b/datafusion/expr-common/src/type_coercion/binary.rs
@@ -796,6 +796,7 @@ pub fn comparison_coercion_numeric(
         return Some(lhs_type.clone());
     }
     binary_numeric_coercion(lhs_type, rhs_type)
+        .or_else(|| dictionary_comparison_coercion_numeric(lhs_type, rhs_type, true))
         .or_else(|| string_coercion(lhs_type, rhs_type))
         .or_else(|| null_coercion(lhs_type, rhs_type))
         .or_else(|| string_numeric_coercion_as_numeric(lhs_type, rhs_type))
@@ -1146,38 +1147,75 @@ fn both_numeric_or_null_and_numeric(lhs_type: &DataType, rhs_type: &DataType) -> bool {
     }
 }
 
-/// Coercion rules for Dictionaries: the type that both lhs and rhs
+/// Generic coercion rules for Dictionaries: the type that both lhs and rhs
 /// can be casted to for the purpose of a computation.
 ///
 /// Not all operators support dictionaries, if `preserve_dictionaries` is true
-/// dictionaries will be preserved if possible
-fn dictionary_comparison_coercion(
+/// dictionaries will be preserved if possible.
+///
+/// The `coerce_fn` parameter determines which comparison coercion function to use
+/// for comparing the dictionary value types.
+fn dictionary_comparison_coercion_generic(
     lhs_type: &DataType,
     rhs_type: &DataType,
     preserve_dictionaries: bool,
+    coerce_fn: fn(&DataType, &DataType) -> Option<DataType>,
 ) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     match (lhs_type, rhs_type) {
         (
             Dictionary(_lhs_index_type, lhs_value_type),
             Dictionary(_rhs_index_type, rhs_value_type),
-        ) => comparison_coercion(lhs_value_type, rhs_value_type),
+        ) => coerce_fn(lhs_value_type, rhs_value_type),
         (d @ Dictionary(_, value_type), other_type)
         | (other_type, d @ Dictionary(_, value_type))
             if preserve_dictionaries && value_type.as_ref() == other_type =>
         {
             Some(d.clone())
         }
-        (Dictionary(_index_type, value_type), _) => {
-            comparison_coercion(value_type, rhs_type)
-        }
-        (_, Dictionary(_index_type, value_type)) => {
-            comparison_coercion(lhs_type, value_type)
-        }
+        (Dictionary(_index_type, value_type), _) => coerce_fn(value_type, rhs_type),
+        (_, Dictionary(_index_type, value_type)) => coerce_fn(lhs_type, value_type),
         _ => None,
     }
 }
 
+/// Coercion rules for Dictionaries: the type that both lhs and rhs
+/// can be casted to for the purpose of a computation.
+///
+/// Not all operators support dictionaries, if `preserve_dictionaries` is true
+/// dictionaries will be preserved if possible
+fn dictionary_comparison_coercion(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_dictionaries: bool,
+) -> Option<DataType> {
+    dictionary_comparison_coercion_generic(
+        lhs_type,
+        rhs_type,
+        preserve_dictionaries,
+        comparison_coercion,
+    )
+}
+
+/// Coercion rules for Dictionaries with numeric preference: similar to
+/// [`dictionary_comparison_coercion`] but uses [`comparison_coercion_numeric`]
+/// which prefers numeric types over strings when both are present.
+///
+/// This is used by [`comparison_coercion_numeric`] to maintain consistent
+/// numeric-preferring semantics when dealing with dictionary types.
+fn dictionary_comparison_coercion_numeric(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_dictionaries: bool,
+) -> Option<DataType> {
+    dictionary_comparison_coercion_generic(
+        lhs_type,
+        rhs_type,
+        preserve_dictionaries,
+        comparison_coercion_numeric,
+    )
+}
+
 /// Coercion rules for string concat.
 /// This is a union of string coercion rules and specified rules:
 /// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8)
diff --git a/datafusion/sqllogictest/test_files/nullif.slt b/datafusion/sqllogictest/test_files/nullif.slt
index 18642f6971ca..6acb9aea26d5 100644
--- a/datafusion/sqllogictest/test_files/nullif.slt
+++ b/datafusion/sqllogictest/test_files/nullif.slt
@@ -174,3 +174,39 @@ query T
 SELECT NULLIF(arrow_cast('a', 'Utf8View'), null);
 ----
 a
+
+# Test with dictionary-encoded strings
+# This tests the fix for: "Dictionary(UInt32, Utf8) and Utf8 is not comparable"
+statement ok
+CREATE TABLE dict_test_base(
+    col1 TEXT,
+    col2 TEXT
+) as VALUES
+    ('foo', 'bar'),
+    ('bar', 'bar'),
+    ('baz', 'bar')
+;
+
+# Dictionary cast with string literal
+query T rowsort
+SELECT NULLIF(arrow_cast(col1, 'Dictionary(Int32, Utf8)'), 'bar') FROM dict_test_base;
+----
+NULL
+baz
+foo
+
+# String with dictionary cast
+query T rowsort
+SELECT NULLIF(col2, arrow_cast(col1, 'Dictionary(Int32, Utf8)')) FROM dict_test_base;
+----
+NULL
+bar
+bar
+
+# Both as dictionaries
+query T rowsort
+SELECT NULLIF(arrow_cast(col1, 'Dictionary(Int32, Utf8)'), arrow_cast('bar', 'Dictionary(Int32, Utf8)')) FROM dict_test_base;
+----
+NULL
+baz
+foo

From f209f98479cc6ba19c67fcbb0d69c441ef46ed3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=ADa=20Adriana?=
Date: Wed, 22 Oct 2025 13:27:34 +0200
Subject: [PATCH 11/13] Upgrade to arrow 56.1.0 (#17275) (#57)

* Update to arrow/parquet 56.1.0

* Adjust for new parquet sizes, update for deprecated API

* Thread through max_predicate_cache_size, add test

(cherry picked from commit 980c948d8a6343d061058afd1596ece29ea25a83)

Co-authored-by: Andrew Lamb
---
 Cargo.lock                                    | 69 ++++++------
 Cargo.toml                                    | 14 +--
 datafusion-cli/src/main.rs                    | 12 +--
 datafusion/common/src/config.rs               |  8 ++
 .../common/src/file_options/parquet_writer.rs | 13 ++-
 .../core/tests/parquet/filter_pushdown.rs     | 101 +++++++++++++++++-
 datafusion/datasource-parquet/src/metadata.rs |  6 +-
 datafusion/datasource-parquet/src/metrics.rs  | 17 +++
 datafusion/datasource-parquet/src/opener.rs   | 62 +++++++++--
 datafusion/datasource-parquet/src/source.rs   |  7 ++
 datafusion/physical-plan/src/spill/mod.rs     |  2 +-
 .../proto/datafusion_common.proto             |  4 +
 datafusion/proto-common/src/from_proto/mod.rs |  3 +
 .../proto-common/src/generated/pbjson.rs      | 24 +++++
 .../proto-common/src/generated/prost.rs       |  9 ++
 datafusion/proto-common/src/to_proto/mod.rs   |  1 +
.../src/generated/datafusion_proto_common.rs | 9 ++ .../proto/src/logical_plan/file_formats.rs | 6 ++ .../sqllogictest/test_files/explain_tree.slt | 8 +- .../test_files/information_schema.slt | 2 + docs/source/user-guide/configs.md | 1 + 21 files changed, 310 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5186c6d6d598..eebb802394b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,9 +240,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" +checksum = "c26b57282a08ae92f727497805122fec964c6245cfa0e13f0e75452eaf3bc41f" dependencies = [ "arrow-arith", "arrow-array", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" +checksum = "cebf38ca279120ff522f4954b81a39527425b6e9f615e6b72842f4de1ffe02b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" +checksum = "744109142cdf8e7b02795e240e20756c2a782ac9180d4992802954a8f871c0de" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" +checksum = "601bb103c4c374bcd1f62c66bcea67b42a2ee91a690486c37d4c180236f11ccc" dependencies = [ "bytes", "half", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" +checksum = "eed61d9d73eda8df9e3014843def37af3050b5080a9acbe108f045a316d5a0be" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" +checksum = "fa95b96ce0c06b4d33ac958370db8c0d31e88e54f9d6e08b0353d18374d9f991" dependencies = [ "arrow-array", "arrow-cast", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" +checksum = "43407f2c6ba2367f64d85d4603d6fb9c4b92ed79d2ffd21021b37efa96523e12" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,9 +354,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" +checksum = "d7c66c5e4a7aedc2bfebffeabc2116d76adb22e08d230b968b995da97f8b11ca" dependencies = [ "arrow-arith", "arrow-array", @@ -381,14 +381,15 @@ dependencies = [ [[package]] name = 
"arrow-ipc" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" +checksum = "e4b0487c4d2ad121cbc42c4db204f1509f8618e589bc77e635e9c40b502e3b90" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", "lz4_flex", "zstd", @@ -396,9 +397,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" +checksum = "26d747573390905905a2dc4c5a61a96163fe2750457f90a04ee2a88680758c79" dependencies = [ "arrow-array", "arrow-buffer", @@ -418,9 +419,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" +checksum = "c142a147dceb59d057bad82400f1693847c80dca870d008bf7b91caf902810ae" dependencies = [ "arrow-array", "arrow-buffer", @@ -431,9 +432,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" +checksum = "5b9038de599df1b81f63b42220e2b6cd6fd4f09af333858cd320db9bef5ac757" dependencies = [ "arrow-array", "arrow-data", @@ -443,9 +444,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" +checksum = "dac6620667fccdab4204689ca173bd84a15de6bb6b756c3a8764d4d7d0c2fc04" dependencies = [ "arrow-array", "arrow-buffer", @@ -456,9 +457,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" +checksum = "dfa93af9ff2bb80de539e6eb2c1c8764abd0f4b73ffb0d7c82bf1f9868785e66" dependencies = [ "bitflags 2.9.1", "serde", @@ -467,9 +468,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" +checksum = "be8b2e0052cd20d36d64f32640b68a5ab54d805d24a473baee5d52017c85536c" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -481,9 +482,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" +checksum = "c2155e26e17f053c8975c546fc70cf19c00542f9abf43c23a88a46ef7204204f" dependencies = [ "arrow-array", "arrow-buffer", @@ -4529,9 +4530,9 @@ dependencies = [ [[package]] name = "parquet" -version = "56.0.0" +version = "56.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" +checksum = "89b56b41d1bd36aae415e42f91cae70ee75cf6cba74416b14dce3e958d5990ec" dependencies = [ "ahash 0.8.12", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index 53c35ed35f0d..78e437250627 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -90,19 +90,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.0.0", features = [ +arrow = { version = "56.1.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.0.0", default-features = false } -arrow-flight = { version = "56.0.0", features = [ +arrow-buffer = { version = "56.1.0", default-features = false } +arrow-flight = { version = "56.1.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "56.0.0", default-features = false, features = [ +arrow-ipc = { version = "56.1.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.0.0", default-features = false } -arrow-schema = { version = "56.0.0", default-features = false } +arrow-ord = { version = "56.1.0", default-features = false } +arrow-schema = { version = "56.1.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" @@ -157,7 +157,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" -parquet = { version = "56.0.0", default-features = false, features = [ +parquet = { version = "56.1.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index a28e97a9f88e..7b73ea08cc27 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -570,15 +570,15 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - assert_snapshot!(batches_to_string(&rbs),@r#" + assert_snapshot!(batches_to_string(&rbs),@r" +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ | alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true | + | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ - "#); + "); // increase the number of hits ctx.sql("select * from alltypes_plain") @@ -601,15 +601,15 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - assert_snapshot!(batches_to_string(&rbs),@r#" + assert_snapshot!(batches_to_string(&rbs),@r" +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ | alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true | + | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ - "#); + "); Ok(()) } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index cdd8e72a06cc..51fc95466394 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -560,6 
+560,14 @@ config_namespace! { /// (reading) Use any available bloom filters when reading parquet files pub bloom_filter_on_read: bool, default = true + /// (reading) The maximum predicate cache size, in bytes. When + /// `pushdown_filters` is enabled, sets the maximum memory used to cache + /// the results of predicate evaluation between filter evaluation and + /// output generation. Decreasing this value will reduce memory usage, + /// but may increase IO and CPU usage. None means use the default + /// parquet reader setting. 0 means no caching. + pub max_predicate_cache_size: Option<usize>, default = None + // The following options affect writing to parquet files // and map to parquet::file::properties::WriterProperties diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 185826aef47d..d052e6bb5948 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -233,6 +233,7 @@ impl ParquetOptions { binary_as_string: _, // not used for writer props coerce_int96: _, // not used for writer props skip_arrow_metadata: _, + max_predicate_cache_size: _, } = self; let mut builder = WriterProperties::builder() @@ -425,6 +426,10 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result usize { } } } + +#[tokio::test] +async fn predicate_cache_default() -> datafusion_common::Result<()> { + let ctx = SessionContext::new(); + // The cache is on by default, but not used unless filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 0, + expected_records: 0, + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(config); + // The cache is on by default, and used when filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 8, + expected_records: 4, + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { + // Can disable the cache even with filter pushdown by setting the size to 0. In this case we + // expect that inner records are reported but no records are read from the cache + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + config + .options_mut() + .execution + .parquet + .max_predicate_cache_size = Some(0); + let ctx = SessionContext::new_with_config(config); + PredicateCacheTest { + // file has 8 rows, which need to be read twice, one for filter, one for + // final output + expected_inner_records: 16, + // Expect this to be 0 records read as the cache is disabled.
However, it is + // non zero due to https://github.com/apache/arrow-rs/issues/8307 + expected_records: 3, + } + .run(&ctx) + .await +} + +/// Runs the query "SELECT * FROM alltypes_plain WHERE double_col != 0.0" +/// with a given SessionContext and asserts that the predicate cache metrics +/// are as expected +#[derive(Debug)] +struct PredicateCacheTest { + /// Expected records read from the underlying reader (to evaluate filters) + /// -- this is the total number of records in the file + expected_inner_records: usize, + /// Expected records to be read from the cache (after filtering) + expected_records: usize, +} + +impl PredicateCacheTest { + async fn run(self, ctx: &SessionContext) -> datafusion_common::Result<()> { + let Self { + expected_inner_records, + expected_records, + } = self; + // Create a dataframe that scans the "alltypes_plain.parquet" file with + // a filter on `double_col != 0.0` + let path = parquet_test_data() + "/alltypes_plain.parquet"; + let exec = ctx + .read_parquet(path, ParquetReadOptions::default()) + .await? + .filter(col("double_col").not_eq(lit(0.0)))? + .create_physical_plan() + .await?; + + // run the plan to completion + let _ = collect(exec.clone(), ctx.task_ctx()).await?; // run plan + let metrics = + TestParquetFile::parquet_metrics(&exec).expect("found parquet metrics"); + + // verify the predicate cache metrics + assert_eq!( + get_value(&metrics, "predicate_cache_inner_records"), + expected_inner_records + ); + assert_eq!( + get_value(&metrics, "predicate_cache_records"), + expected_records + ); + Ok(()) + } +} diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs index 71c81a25001b..8e3de5bbf2c0 100644 --- a/datafusion/datasource-parquet/src/metadata.rs +++ b/datafusion/datasource-parquet/src/metadata.rs @@ -39,7 +39,9 @@ use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::arrow::parquet_to_arrow_schema; -use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; +use parquet::file::metadata::{ + PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, +}; use std::any::Any; use std::collections::HashMap; use std::sync::Arc; @@ -148,7 +150,7 @@ impl<'a> DFParquetMetadata<'a> { if cache_metadata && file_metadata_cache.is_some() { // Need to retrieve the entire metadata for the caching to be effective. - reader = reader.with_page_indexes(true); + reader = reader.with_page_index_policy(PageIndexPolicy::Optional); } let metadata = Arc::new( diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 574fe2a040ea..d75a979d4cad 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -72,6 +72,13 @@ pub struct ParquetFileMetrics { pub page_index_eval_time: Time, /// Total time spent reading and parsing metadata from the footer pub metadata_load_time: Time, + /// Predicate Cache: number of records read directly from the inner reader. + /// This is the number of rows decoded while evaluating predicates + pub predicate_cache_inner_records: Count, + /// Predicate Cache: number of records read from the cache. This is the + /// number of rows that were stored in the cache after evaluating predicates + /// reused for the output. 
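+ /// When filter pushdown fully reuses the cache, this typically matches the final output row count.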
+ pub predicate_cache_records: Count, } impl ParquetFileMetrics { @@ -140,6 +147,14 @@ impl ParquetFileMetrics { let files_ranges_pruned_statistics = MetricBuilder::new(metrics) .counter("files_ranges_pruned_statistics", partition); + let predicate_cache_inner_records = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("predicate_cache_inner_records", partition); + + let predicate_cache_records = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("predicate_cache_records", partition); + Self { files_ranges_pruned_statistics, predicate_evaluation_errors, @@ -157,6 +172,8 @@ impl ParquetFileMetrics { bloom_filter_eval_time, page_index_eval_time, metadata_load_time, + predicate_cache_inner_records, + predicate_cache_records, } } } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 93a3d4af5432..def53706c34a 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -51,10 +51,11 @@ use datafusion_execution::parquet_encryption::EncryptionFactory; use futures::{ready, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; use log::debug; +use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::ParquetMetaDataReader; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader}; /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { @@ -105,6 +106,9 @@ pub(super) struct ParquetOpener { #[cfg(feature = "parquet_encryption")] pub encryption_factory: Option<(Arc, EncryptionFactoryOptions)>, + /// Maximum size of the predicate cache, in bytes. If none, uses + /// the arrow-rs default. 
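+ /// Setting this to `Some(0)` disables the cache entirely.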
+ pub max_predicate_cache_size: Option, } impl FileOpener for ParquetOpener { @@ -152,6 +156,7 @@ impl FileOpener for ParquetOpener { let enable_page_index = self.enable_page_index; let encryption_context = self.get_encryption_context(); + let max_predicate_cache_size = self.max_predicate_cache_size; Ok(Box::pin(async move { let file_decryption_properties = encryption_context @@ -401,21 +406,42 @@ impl FileOpener for ParquetOpener { builder = builder.with_limit(limit) } + if let Some(max_predicate_cache_size) = max_predicate_cache_size { + builder = builder.with_max_predicate_cache_size(max_predicate_cache_size); + } + + // metrics from the arrow reader itself + let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + let stream = builder .with_projection(mask) .with_batch_size(batch_size) .with_row_groups(row_group_indexes) + .with_metrics(arrow_reader_metrics.clone()) .build()?; - let stream = stream - .map_err(DataFusionError::from) - .map(move |b| b.and_then(|b| schema_mapping.map_batch(b))); + let files_ranges_pruned_statistics = + file_metrics.files_ranges_pruned_statistics.clone(); + let predicate_cache_inner_records = + file_metrics.predicate_cache_inner_records.clone(); + let predicate_cache_records = file_metrics.predicate_cache_records.clone(); + + let stream = stream.map_err(DataFusionError::from).map(move |b| { + b.and_then(|b| { + copy_arrow_reader_metrics( + &arrow_reader_metrics, + &predicate_cache_inner_records, + &predicate_cache_records, + ); + schema_mapping.map_batch(b) + }) + }); if let Some(file_pruner) = file_pruner { Ok(EarlyStoppingStream::new( stream, file_pruner, - file_metrics.files_ranges_pruned_statistics.clone(), + files_ranges_pruned_statistics, ) .boxed()) } else { @@ -425,6 +451,22 @@ impl FileOpener for ParquetOpener { } } +/// Copies metrics from ArrowReaderMetrics (the metrics collected by the +/// arrow-rs parquet reader) to the parquet file metrics for DataFusion +fn copy_arrow_reader_metrics( + arrow_reader_metrics: &ArrowReaderMetrics, + predicate_cache_inner_records: &Count, + predicate_cache_records: &Count, +) { + if let Some(v) = arrow_reader_metrics.records_read_from_inner() { + predicate_cache_inner_records.add(v); + } + + if let Some(v) = arrow_reader_metrics.records_read_from_cache() { + predicate_cache_records.add(v); + } +} + /// Wraps an inner RecordBatchStream and a [`FilePruner`] /// /// This can terminate the scan early when some dynamic filters is updated after @@ -652,8 +694,8 @@ async fn load_page_index( if missing_column_index || missing_offset_index { let m = Arc::try_unwrap(Arc::clone(parquet_metadata)) .unwrap_or_else(|e| e.as_ref().clone()); - let mut reader = - ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); + let mut reader = ParquetMetaDataReader::new_with_metadata(m) + .with_page_index_policy(PageIndexPolicy::Optional); reader.load_page_index(input).await?; let new_parquet_metadata = reader.finish()?; let new_arrow_reader = @@ -823,6 +865,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -911,6 +954,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1015,6 +1059,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, 
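+ // None: these tests use the arrow-rs default predicate cache size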
+ max_predicate_cache_size: None, } }; let make_meta = || FileMeta { @@ -1129,6 +1174,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1244,6 +1290,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1426,6 +1473,7 @@ mod test { expr_adapter_factory: None, #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, }; let predicate = logical2physical(&col("a").eq(lit(1u64)), &table_schema); diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 007c239ef492..4fff23413dd0 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -426,6 +426,12 @@ impl ParquetSource { self.table_parquet_options.global.bloom_filter_on_read } + /// Return the maximum predicate cache size, in bytes, used when + /// `pushdown_filters` + pub fn max_predicate_cache_size(&self) -> Option { + self.table_parquet_options.global.max_predicate_cache_size + } + /// Applies schema adapter factory from the FileScanConfig if present. /// /// # Arguments @@ -580,6 +586,7 @@ impl FileSource for ParquetSource { expr_adapter_factory, #[cfg(feature = "parquet_encryption")] encryption_factory: self.get_encryption_factory_with_config(), + max_predicate_cache_size: self.max_predicate_cache_size(), }) } diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index fab62bff840f..fbea6ae85a39 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -752,7 +752,7 @@ mod tests { .unwrap(); let size = get_record_batch_memory_size(&batch); - assert_eq!(size, 8320); + assert_eq!(size, 8208); } // ==== Spill manager tests ==== diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index f5c79cf3d9a4..f56c7318609a 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -580,6 +580,10 @@ message ParquetOptions { oneof coerce_int96_opt { string coerce_int96 = 32; } + + oneof max_predicate_cache_size_opt { + uint64 max_predicate_cache_size = 33; + } } enum JoinSide { diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index c5242d0176e6..103e52dfec90 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -999,6 +999,9 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::CoerceInt96Opt::CoerceInt96(v) => Some(v), }).unwrap_or(None), skip_arrow_metadata: value.skip_arrow_metadata, + max_predicate_cache_size: value.max_predicate_cache_size_opt.map(|opt| match opt { + protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => Some(v as usize), + }).unwrap_or(None), }) } } diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 48782ff1d93a..f4b0022722c9 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -5610,6 +5610,9 @@ impl serde::Serialize for ParquetOptions { if 
self.coerce_int96_opt.is_some() { len += 1; } + if self.max_predicate_cache_size_opt.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetOptions", len)?; if self.enable_page_index { struct_ser.serialize_field("enablePageIndex", &self.enable_page_index)?; @@ -5763,6 +5766,15 @@ impl serde::Serialize for ParquetOptions { } } } + if let Some(v) = self.max_predicate_cache_size_opt.as_ref() { + match v { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("maxPredicateCacheSize", ToString::to_string(&v).as_str())?; + } + } + } struct_ser.end() } } @@ -5830,6 +5842,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "bloomFilterNdv", "coerce_int96", "coerceInt96", + "max_predicate_cache_size", + "maxPredicateCacheSize", ]; #[allow(clippy::enum_variant_names)] @@ -5864,6 +5878,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { BloomFilterFpp, BloomFilterNdv, CoerceInt96, + MaxPredicateCacheSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -5915,6 +5930,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), "coerceInt96" | "coerce_int96" => Ok(GeneratedField::CoerceInt96), + "maxPredicateCacheSize" | "max_predicate_cache_size" => Ok(GeneratedField::MaxPredicateCacheSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5964,6 +5980,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; let mut coerce_int96_opt__ = None; + let mut max_predicate_cache_size_opt__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::EnablePageIndex => { @@ -6160,6 +6177,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } coerce_int96_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::CoerceInt96Opt::CoerceInt96); } + GeneratedField::MaxPredicateCacheSize => { + if max_predicate_cache_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxPredicateCacheSize")); + } + max_predicate_cache_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(x.0)); + } } } Ok(ParquetOptions { @@ -6193,6 +6216,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, coerce_int96_opt: coerce_int96_opt__, + max_predicate_cache_size_opt: max_predicate_cache_size_opt__, }) } } diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index aa23cea57470..aa94c7937661 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -833,6 +833,10 @@ pub struct ParquetOptions { pub bloom_filter_ndv_opt: ::core::option::Option, #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")] pub coerce_int96_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")] + pub max_predicate_cache_size_opt: ::core::option::Option< + parquet_options::MaxPredicateCacheSizeOpt, + >, } /// Nested message and enum types in `ParquetOptions`. pub mod parquet_options { @@ -886,6 +890,11 @@ pub mod parquet_options { #[prost(string, tag = "32")] CoerceInt96(::prost::alloc::string::String), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxPredicateCacheSizeOpt { + #[prost(uint64, tag = "33")] + MaxPredicateCacheSize(u64), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct Precision { diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index c06427065733..610531a5bf91 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -842,6 +842,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { binary_as_string: value.binary_as_string, skip_arrow_metadata: value.skip_arrow_metadata, coerce_int96_opt: value.coerce_int96.clone().map(protobuf::parquet_options::CoerceInt96Opt::CoerceInt96), + max_predicate_cache_size_opt: value.max_predicate_cache_size.map(|v| protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v as u64)), }) } } diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index aa23cea57470..aa94c7937661 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -833,6 +833,10 @@ pub struct ParquetOptions { pub bloom_filter_ndv_opt: ::core::option::Option, #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")] pub coerce_int96_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")] + pub max_predicate_cache_size_opt: ::core::option::Option< + parquet_options::MaxPredicateCacheSizeOpt, + >, } /// Nested message and enum types in `ParquetOptions`. 
pub mod parquet_options { @@ -886,6 +890,11 @@ pub mod parquet_options { #[prost(string, tag = "32")] CoerceInt96(::prost::alloc::string::String), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum MaxPredicateCacheSizeOpt { + #[prost(uint64, tag = "33")] + MaxPredicateCacheSize(u64), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct Precision { diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 654607bd733d..09e4a6aa2a89 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -411,6 +411,9 @@ impl TableParquetOptionsProto { coerce_int96_opt: global_options.global.coerce_int96.map(|compression| { parquet_options::CoerceInt96Opt::CoerceInt96(compression) }), + max_predicate_cache_size_opt: global_options.global.max_predicate_cache_size.map(|size| { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size as u64) + }), }), column_specific_options: column_specific_options.into_iter().map(|(column_name, options)| { ParquetColumnSpecificOptions { @@ -504,6 +507,9 @@ impl From<&ParquetOptionsProto> for ParquetOptions { coerce_int96: proto.coerce_int96_opt.as_ref().map(|opt| match opt { parquet_options::CoerceInt96Opt::CoerceInt96(coerce_int96) => coerce_int96.clone(), }), + max_predicate_cache_size: proto.max_predicate_cache_size_opt.as_ref().map(|opt| match opt { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size) => *size as usize, + }), } } } diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 0df361a75bae..b4916cb6e6ec 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -1314,7 +1314,7 @@ physical_plan 11)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 12)│ DataSourceExec ││ DataSourceExec │ 13)│ -------------------- ││ -------------------- │ -14)│ bytes: 6040 ││ bytes: 6040 │ +14)│ bytes: 5932 ││ bytes: 5932 │ 15)│ format: memory ││ format: memory │ 16)│ rows: 1 ││ rows: 1 │ 17)└───────────────────────────┘└───────────────────────────┘ @@ -1798,7 +1798,7 @@ physical_plan 11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ @@ -1821,7 +1821,7 @@ physical_plan 11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ @@ -1844,7 +1844,7 @@ physical_plan 11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index fb2c89020112..0c3c1a320234 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -241,6 +241,7 @@ datafusion.execution.parquet.dictionary_enabled true datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL +datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 
1048576 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 @@ -355,6 +356,7 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting +datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 3d4730958fb3..c3fe5ffb1134 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -88,6 +88,7 @@ The following configuration settings are available: | datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | | datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. 
This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | | datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | +| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. | | datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | From f097ad4a78f3df29db897992262948b95fb3afb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=ADa=20Adriana?= Date: Tue, 4 Nov 2025 08:52:25 -0500 Subject: [PATCH 12/13] Cherry-pick filter passthrough AggregateExec and UnionExec (#58) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Allow filter pushdown through AggregateExec (#18404) ## Which issue does this PR close? - Closes #18399 ## Rationale for this change Right now filters cannot pass through `AggregateExec` nodes, preventing filter pushdown optimization in queries with GROUP BY/DISTINCT operations. ## What changes are included in this PR? - Implemented `gather_filters_for_pushdown()` for `AggregateExec` that allows filters on grouping columns to pass through to children - Supports both Pre phase (static filters) and Post phase (dynamic filters from joins) Essentially, filter will pass through in the scenarios @asolimando mentioned [here](https://github.com/apache/datafusion/issues/18399#issuecomment-3472572336) ## Are these changes tested? Yes, added three tests: - `test_aggregate_filter_pushdown`: Positive case with aggregate functions - `test_no_pushdown_aggregate_filter_on_non_grouping_column`: Negative case ensuring filters on aggregate results are not pushed ## Are there any user-facing changes? (cherry picked from commit 076b091a4706973e722c36edf35f605a669e5255) * physical-plan: push filters down to UnionExec children (#18054) Filters are safe to be pushed down, so we can override the default behavior here. Signed-off-by: Alfonso Subiotto Marques (cherry picked from commit 0ecd59bd05eb31e19194d00b707037f5d210de79) * fix: prevent UnionExec panic with empty inputs (#17449) * fix: prevent UnionExec panic with empty inputs This commit fixes a panic in UnionExec when constructed with empty inputs. Previously, UnionExec::new(vec![]) would cause an index out of bounds panic at union.rs:542 when trying to access inputs[0]. 
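For illustration, here is a hypothetical sketch of the validating constructor described below (simplified; the actual implementation lives in datafusion/physical-plan/src/union.rs and may differ in details):

```rust
use std::sync::Arc;

use datafusion_common::{internal_err, Result};
use datafusion_physical_plan::union::UnionExec;
use datafusion_physical_plan::ExecutionPlan;

// Sketch only: mirrors the behavior described in this commit message.
fn try_new_union(
    inputs: Vec<Arc<dyn ExecutionPlan>>,
) -> Result<Arc<dyn ExecutionPlan>> {
    match inputs.len() {
        // Previously this case panicked when union_schema() indexed inputs[0]
        0 => internal_err!("UnionExec requires at least one input"),
        // A union of a single input is that input, so return it directly
        1 => Ok(inputs.into_iter().next().expect("length checked above")),
        // Two or more inputs: build the actual UnionExec
        #[allow(deprecated)]
        _ => Ok(Arc::new(UnionExec::new(inputs))),
    }
}
```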
Changes: - Made UnionExec::new() return Result with proper validation - Made union_schema() return Result with empty input checks - Added descriptive error messages for empty input cases - Updated all call sites to handle the new Result return type - Added comprehensive tests for edge cases Error messages: - "UnionExec requires at least one input" - "Cannot create union schema from empty inputs" The fix maintains backward compatibility for valid inputs while preventing crashes and providing clear error messages for invalid usage. Fixes #17052 * refactor: address PR review comments for UnionExec empty inputs fix - Add new try_new method that returns Result<Arc<dyn ExecutionPlan>> - Deprecate existing new method in favor of try_new - Optimize single-input case: try_new returns the input directly - Remove redundant assert!(result.is_err()) from tests - Rename test_union_multiple_inputs_still_works to test_union_schema_multiple_inputs - Update all call sites to use appropriate API (try_new for new code, deprecated new for tests) This maintains backward compatibility while providing better error handling and optimization for single-input cases. * Fix cargo fmt and clippy warnings - Add proper feature gates for parquet_encryption in datasource-parquet - Format code to pass cargo fmt checks - All tests passing * Fix clippy --------- Co-authored-by: Eeshan Co-authored-by: ebembi-crdb Co-authored-by: Andrew Lamb (cherry picked from commit b122a1643adf9f89c01adda5afbfb4bdc0284e85) --------- Signed-off-by: Alfonso Subiotto Marques Co-authored-by: Alfonso Subiotto Marqués Co-authored-by: EeshanBembi <33062610+EeshanBembi@users.noreply.github.com> Co-authored-by: Eeshan Co-authored-by: ebembi-crdb Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_planner.rs | 2 +- .../enforce_distribution.rs | 2 + .../physical_optimizer/filter_pushdown/mod.rs | 492 +++++++++++++++++- .../partition_statistics.rs | 1 + .../physical_optimizer/projection_pushdown.rs | 1 + .../tests/physical_optimizer/test_utils.rs | 1 + datafusion/datasource-parquet/src/opener.rs | 11 + datafusion/datasource-parquet/src/source.rs | 3 + .../physical-plan/src/aggregates/mod.rs | 88 ++++ .../physical-plan/src/repartition/mod.rs | 2 + datafusion/physical-plan/src/union.rs | 152 +++++- datafusion/proto/src/physical_plan/mod.rs | 1 + .../tests/cases/roundtrip_physical_plan.rs | 1 + 13 files changed, 727 insertions(+), 30 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index a06106a96465..61069cfba0da 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1243,7 +1243,7 @@ impl DefaultPhysicalPlanner { } // N Children - LogicalPlan::Union(_) => Arc::new(UnionExec::new(children.vec())), + LogicalPlan::Union(_) => UnionExec::try_new(children.vec())?, LogicalPlan::Extension(Extension { node }) => { let mut maybe_plan = None; let children = children.vec(); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index e0826c90dd8d..1ddeb3c61148 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -1783,6 +1783,7 @@ fn union_to_interleave() -> Result<()> { ); // Union + #[allow(deprecated)] let plan = Arc::new(UnionExec::new(vec![left, right])); // final agg @@ -1827,6 +1828,7 @@ fn union_not_to_interleave() -> Result<()> { ); // Union + #[allow(deprecated)] let plan = 
Arc::new(UnionExec::new(vec![left, right])); // final agg diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index b949ae159756..a3b07cbc44cb 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -56,6 +56,7 @@ use datafusion_physical_plan::{ ExecutionPlan, }; +use datafusion_physical_plan::union::UnionExec; use futures::StreamExt; use object_store::{memory::InMemory, ObjectStore}; use util::{format_plan_for_test, OptimizationTest, TestNode, TestScanBuilder}; @@ -536,11 +537,11 @@ fn test_push_down_through_transparent_nodes() { } #[test] -fn test_no_pushdown_through_aggregates() { - // There are 2 important points here: - // 1. The outer filter **is not** pushed down at all because we haven't implemented pushdown support - // yet for AggregateExec. - // 2. The inner filter **is** pushed down into the DataSource. +fn test_pushdown_through_aggregates_on_grouping_columns() { + // Test that filters on grouping columns can be pushed through AggregateExec. + // This test has two filters: + // 1. An inner filter (a@0 = foo) below the aggregate - gets pushed to DataSource + // 2. An outer filter (b@1 = bar) above the aggregate - also gets pushed through because 'b' is a grouping column let scan = TestScanBuilder::new(schema()).with_support(true).build(); let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 10)); @@ -579,7 +580,7 @@ fn test_no_pushdown_through_aggregates() { let predicate = col_lit_predicate("b", "bar", &schema()); let plan = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap()); - // expect the predicate to be pushed down into the DataSource + // Both filters should be pushed down to the DataSource since both reference grouping columns insta::assert_snapshot!( OptimizationTest::new(plan, FilterPushdown::new(), true), @r" @@ -593,11 +594,10 @@ fn test_no_pushdown_through_aggregates() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true output: Ok: - - FilterExec: b@1 = bar - - CoalesceBatchesExec: target_batch_size=100 - - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0]) - - CoalesceBatchesExec: target_batch_size=10 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + - CoalesceBatchesExec: target_batch_size=100 + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=Sorted + - CoalesceBatchesExec: target_batch_size=10 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar " ); } @@ -1517,6 +1517,34 @@ STORED AS PARQUET; // Pushdown pruned most rows } +#[test] +fn test_filter_pushdown_through_union() { + let scan1 = TestScanBuilder::new(schema()).with_support(true).build(); + let scan2 = TestScanBuilder::new(schema()).with_support(true).build(); + + let union = UnionExec::try_new(vec![scan1, scan2]).unwrap(); + + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - UnionExec + - DataSourceExec: file_groups={1 group: 
[[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - UnionExec + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + /// Schema: /// a: String /// b: String @@ -1547,3 +1575,445 @@ fn col_lit_predicate( Arc::new(Literal::new(scalar_value)), )) } + +#[tokio::test] +async fn test_aggregate_filter_pushdown() { + // Test that filters can pass through AggregateExec even with aggregate functions + // when the filter references grouping columns + // Simulates: SELECT a, COUNT(b) FROM table WHERE a = 'x' GROUP BY a + + let batches = + vec![ + record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(), + ]; + + let scan = TestScanBuilder::new(schema()) + .with_support(true) + .with_batches(batches) + .build(); + + // Create an aggregate: GROUP BY a with COUNT(b) + let group_by = PhysicalGroupBy::new_single(vec![( + col("a", &schema()).unwrap(), + "a".to_string(), + )]); + + // Add COUNT aggregate + let count_expr = + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()]) + .schema(schema()) + .alias("count") + .build() + .unwrap(); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Partial, + group_by, + vec![count_expr.into()], // Has aggregate function + vec![None], // No filter on the aggregate function + Arc::clone(&scan), + schema(), + ) + .unwrap(), + ); + + // Add a filter on the grouping column 'a' + let predicate = col_lit_predicate("a", "x", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()) + as Arc; + + // Even with aggregate functions, filter on grouping column should be pushed through + insta::assert_snapshot!( + OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = x + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = x + " + ); +} + +#[tokio::test] +async fn test_no_pushdown_filter_on_aggregate_result() { + // Test that filters on aggregate results (not grouping columns) are NOT pushed through + // SELECT a, COUNT(b) as cnt FROM table GROUP BY a HAVING cnt > 5 + // The filter on 'cnt' cannot be pushed down because it's an aggregate result + + let batches = + vec![ + record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(), + ]; + + let scan = TestScanBuilder::new(schema()) + .with_support(true) + .with_batches(batches) + .build(); + + // Create an aggregate: GROUP BY a with COUNT(b) + let group_by = PhysicalGroupBy::new_single(vec![( + col("a", &schema()).unwrap(), + "a".to_string(), + )]); + + // Add COUNT aggregate + let count_expr = + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()]) + .schema(schema()) + .alias("count") + .build() + .unwrap(); + + let aggregate = Arc::new( + 
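+ // Partial-mode aggregate with GROUP BY on 'a'; the filter on this grouping column added below should pass through it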
AggregateExec::try_new( + AggregateMode::Partial, + group_by, + vec![count_expr.into()], + vec![None], + Arc::clone(&scan), + schema(), + ) + .unwrap(), + ); + + // Add a filter on the aggregate output column + // This simulates filtering on COUNT result, which should NOT be pushed through + let agg_schema = aggregate.schema(); + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("count[count]", &agg_schema).unwrap()), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int64(Some(5)))), + )); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()) + as Arc; + + // The filter should NOT be pushed through the aggregate since it's on an aggregate result + insta::assert_snapshot!( + OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: count[count]@1 > 5 + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: count[count]@1 > 5 + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); +} + +#[test] +fn test_pushdown_filter_on_non_first_grouping_column() { + // Test that filters on non-first grouping columns are still pushed down + // SELECT a, b, count(*) as cnt FROM table GROUP BY a, b HAVING b = 'bar' + // The filter is on 'b' (second grouping column), should push down + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + let group_by = PhysicalGroupBy::new_single(vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_no_pushdown_grouping_sets_filter_on_missing_column() { + // Test that filters on columns missing from some grouping sets are NOT pushed through + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", 
&schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'a' which is missing in the second grouping set, should not be pushed down + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); +} + +#[test] +fn test_pushdown_grouping_sets_filter_on_common_column() { + // Test that filters on columns present in ALL grouping sets ARE pushed through + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'b' which is present in all grouping sets will be pushed down + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_pushdown_with_empty_group_by() { + // Test that filters can be pushed down when GROUP BY is empty (no grouping columns) + // SELECT count(*) as cnt FROM table WHERE a = 'foo' + // There are 
no grouping columns, so the filter should still push down + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Empty GROUP BY - no grouping columns + let group_by = PhysicalGroupBy::new_single(vec![]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on 'a' + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + // The filter should be pushed down even with empty GROUP BY + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_pushdown_with_computed_grouping_key() { + // Test filter pushdown with computed grouping expression + // SELECT (c + 1.0) as c_plus_1, count(*) FROM table WHERE c > 5.0 GROUP BY (c + 1.0) + + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let predicate = Arc::new(BinaryExpr::new( + col("c", &schema()).unwrap(), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Float64(Some(5.0)))), + )) as Arc; + let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + let c_plus_one = Arc::new(BinaryExpr::new( + col("c", &schema()).unwrap(), + Operator::Plus, + Arc::new(Literal::new(ScalarValue::Float64(Some(1.0)))), + )) as Arc; + + let group_by = + PhysicalGroupBy::new_single(vec![(c_plus_one, "c_plus_1".to_string())]); + + let plan = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + filter, + schema(), + ) + .unwrap(), + ); + + // The filter should be pushed down because 'c' is extracted from the grouping expression (c + 1.0) + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt] + - FilterExec: c@2 > 5 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=c@2 > 5 + " + ); +} diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs index 7e9d5bf1b901..eb174fba79f5 100644 --- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs +++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs @@ -356,6 +356,7 @@ mod test { 
#[tokio::test] async fn test_statistic_by_partition_of_union() -> Result<()> { let scan = create_scan_exec_with_statistics(None, Some(2)).await; + #[allow(deprecated)] let union_exec: Arc = Arc::new(UnionExec::new(vec![scan.clone(), scan])); let statistics = (0..union_exec.output_partitioning().partition_count()) diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 7160ed4184b0..ab753d00b4a9 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -1535,6 +1535,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { #[test] fn test_union_after_projection() -> Result<()> { let csv = create_simple_csv_exec(); + #[allow(deprecated)] let union = Arc::new(UnionExec::new(vec![csv.clone(), csv.clone(), csv])); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 69dbe04927b2..49efe24fb8f9 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -304,6 +304,7 @@ pub fn sort_preserving_merge_exec_with_fetch( } pub fn union_exec(input: Vec>) -> Arc { + #[allow(deprecated)] Arc::new(UnionExec::new(input)) } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index def53706c34a..13b8a574c353 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -99,6 +99,7 @@ pub(super) struct ParquetOpener { /// Coerce INT96 timestamps to specific TimeUnit pub coerce_int96: Option, /// Optional parquet FileDecryptionProperties + #[cfg(feature = "parquet_encryption")] pub file_decryption_properties: Option>, /// Rewrite expressions in the context of the file schema pub(crate) expr_adapter_factory: Option>, @@ -155,10 +156,12 @@ impl FileOpener for ParquetOpener { let mut predicate_file_schema = Arc::clone(&self.logical_file_schema); let enable_page_index = self.enable_page_index; + #[cfg(feature = "parquet_encryption")] let encryption_context = self.get_encryption_context(); let max_predicate_cache_size = self.max_predicate_cache_size; Ok(Box::pin(async move { + #[cfg(feature = "parquet_encryption")] let file_decryption_properties = encryption_context .get_file_decryption_properties(&file_location) .await?; @@ -544,6 +547,7 @@ where } #[derive(Default)] +#[cfg_attr(not(feature = "parquet_encryption"), allow(dead_code))] struct EncryptionContext { #[cfg(feature = "parquet_encryption")] file_decryption_properties: Option>, @@ -586,6 +590,7 @@ impl EncryptionContext { } #[cfg(not(feature = "parquet_encryption"))] +#[allow(dead_code)] impl EncryptionContext { async fn get_file_decryption_properties( &self, @@ -605,6 +610,7 @@ impl ParquetOpener { } #[cfg(not(feature = "parquet_encryption"))] + #[allow(dead_code)] fn get_encryption_context(&self) -> EncryptionContext { EncryptionContext::default() } @@ -861,6 +867,7 @@ mod test { schema_adapter_factory: Arc::new(DefaultSchemaAdapterFactory), enable_row_group_stats_pruning: true, coerce_int96: None, + #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] @@ -950,6 +957,7 @@ mod test { schema_adapter_factory: 
Arc::new(DefaultSchemaAdapterFactory), enable_row_group_stats_pruning: true, coerce_int96: None, + #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] @@ -1055,6 +1063,7 @@ mod test { schema_adapter_factory: Arc::new(DefaultSchemaAdapterFactory), enable_row_group_stats_pruning: true, coerce_int96: None, + #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] @@ -1170,6 +1179,7 @@ mod test { schema_adapter_factory: Arc::new(DefaultSchemaAdapterFactory), enable_row_group_stats_pruning: false, // note that this is false! coerce_int96: None, + #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] @@ -1286,6 +1296,7 @@ mod test { schema_adapter_factory: Arc::new(DefaultSchemaAdapterFactory), enable_row_group_stats_pruning: true, coerce_int96: None, + #[cfg(feature = "parquet_encryption")] file_decryption_properties: None, expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 4fff23413dd0..f32725aa6653 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -52,6 +52,7 @@ use datafusion_physical_plan::metrics::Count; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; +#[cfg(feature = "parquet_encryption")] use datafusion_common::encryption::map_config_decryption_to_decryption; #[cfg(feature = "parquet_encryption")] use datafusion_execution::parquet_encryption::EncryptionFactory; @@ -547,6 +548,7 @@ impl FileSource for ParquetSource { Arc::new(DefaultParquetFileReaderFactory::new(object_store)) as _ }); + #[cfg(feature = "parquet_encryption")] let file_decryption_properties = self .table_parquet_options() .crypto @@ -582,6 +584,7 @@ impl FileSource for ParquetSource { enable_row_group_stats_pruning: self.table_parquet_options.global.pruning, schema_adapter_factory, coerce_int96, + #[cfg(feature = "parquet_encryption")] file_decryption_properties, expr_adapter_factory, #[cfg(feature = "parquet_encryption")] diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 878bccc1d177..30d1441f5773 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -26,12 +26,18 @@ use crate::aggregates::{ topk_stream::GroupedTopKAggregateStream, }; use crate::execution_plan::{CardinalityEffect, EmissionType}; +use crate::filter_pushdown::{ + ChildFilterDescription, FilterDescription, FilterPushdownPhase, PushedDownPredicate, +}; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::windows::get_ordered_partition_by_indices; use crate::{ DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode, SendableRecordBatchStream, Statistics, }; +use datafusion_common::config::ConfigOptions; +use datafusion_physical_expr::utils::collect_columns; +use std::collections::HashSet; use arrow::array::{ArrayRef, UInt16Array, UInt32Array, UInt64Array, UInt8Array}; use arrow::datatypes::{Field, Schema, SchemaRef}; @@ -1025,6 +1031,88 
@@ impl ExecutionPlan for AggregateExec { fn cardinality_effect(&self) -> CardinalityEffect { CardinalityEffect::LowerEqual } + + /// Push down parent filters when possible (see implementation comment for details), + /// but do not introduce any new self filters. + fn gather_filters_for_pushdown( + &self, + _phase: FilterPushdownPhase, + parent_filters: Vec>, + _config: &ConfigOptions, + ) -> Result { + // It's safe to push down filters through aggregates when filters only reference + // grouping columns, because such filters determine which groups to compute, not + // *how* to compute them. Each group's aggregate values (SUM, COUNT, etc.) are + // calculated from the same input rows regardless of whether we filter before or + // after grouping - filtering before just eliminates entire groups early. + // This optimization is NOT safe for filters on aggregated columns (like filtering on + // the result of SUM or COUNT), as those require computing all groups first. + + let grouping_columns: HashSet<_> = self + .group_by + .expr() + .iter() + .flat_map(|(expr, _)| collect_columns(expr)) + .collect(); + + // Analyze each filter separately to determine if it can be pushed down + let mut safe_filters = Vec::new(); + let mut unsafe_filters = Vec::new(); + + for filter in parent_filters { + let filter_columns: HashSet<_> = + collect_columns(&filter).into_iter().collect(); + + // Check if this filter references non-grouping columns + let references_non_grouping = !grouping_columns.is_empty() + && !filter_columns.is_subset(&grouping_columns); + + if references_non_grouping { + unsafe_filters.push(filter); + continue; + } + + // For GROUPING SETS, verify this filter's columns appear in all grouping sets + if self.group_by.groups().len() > 1 { + let filter_column_indices: Vec = filter_columns + .iter() + .filter_map(|filter_col| { + self.group_by.expr().iter().position(|(expr, _)| { + collect_columns(expr).contains(filter_col) + }) + }) + .collect(); + + // Check if any of this filter's columns are missing from any grouping set + let has_missing_column = self.group_by.groups().iter().any(|null_mask| { + filter_column_indices + .iter() + .any(|&idx| null_mask.get(idx) == Some(&true)) + }); + + if has_missing_column { + unsafe_filters.push(filter); + continue; + } + } + + // This filter is safe to push down + safe_filters.push(filter); + } + + // Build child filter description with both safe and unsafe filters + let child = self.children()[0]; + let mut child_desc = ChildFilterDescription::from_child(&safe_filters, child)?; + + // Add unsafe filters as unsupported + child_desc.parent_filters.extend( + unsafe_filters + .into_iter() + .map(PushedDownPredicate::unsupported), + ); + + Ok(FilterDescription::new().with_child(child_desc)) + } } fn create_schema( diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 3cd6ee6c1af3..cd188a648f94 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1783,6 +1783,7 @@ mod test { let source1 = sorted_memory_exec(&schema, sort_exprs.clone()); let source2 = sorted_memory_exec(&schema, sort_exprs); // output has multiple partitions, and is sorted + #[allow(deprecated)] let union = UnionExec::new(vec![source1, source2]); let exec = RepartitionExec::try_new(Arc::new(union), Partitioning::RoundRobinBatch(10)) @@ -1825,6 +1826,7 @@ mod test { let source1 = memory_exec(&schema); let source2 = memory_exec(&schema); // output has multiple partitions, 
but is not sorted + #[allow(deprecated)] let union = UnionExec::new(vec![source1, source2]); let exec = RepartitionExec::try_new(Arc::new(union), Partitioning::RoundRobinBatch(10)) diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index aca03c57b1b4..9166ff0d6455 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -36,16 +36,18 @@ use crate::execution_plan::{ boundedness_from_children, check_default_invariants, emission_type_from_children, InvariantLevel, }; +use crate::filter_pushdown::{FilterDescription, FilterPushdownPhase}; use crate::metrics::BaselineMetrics; use crate::projection::{make_with_child, ProjectionExec}; use crate::stream::ObservedStream; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; +use datafusion_common::config::ConfigOptions; use datafusion_common::stats::Precision; use datafusion_common::{exec_err, internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; -use datafusion_physical_expr::{calculate_union, EquivalenceProperties}; +use datafusion_physical_expr::{calculate_union, EquivalenceProperties, PhysicalExpr}; use futures::Stream; use itertools::Itertools; @@ -101,8 +103,10 @@ pub struct UnionExec { impl UnionExec { /// Create a new UnionExec + #[deprecated(since = "44.0.0", note = "Use UnionExec::try_new instead")] pub fn new(inputs: Vec>) -> Self { - let schema = union_schema(&inputs); + let schema = + union_schema(&inputs).expect("UnionExec::new called with empty inputs"); // The schema of the inputs and the union schema is consistent when: // - They have the same number of fields, and // - Their fields have same types at the same indices. @@ -116,6 +120,37 @@ impl UnionExec { } } + /// Try to create a new UnionExec. + /// + /// # Errors + /// Returns an error if: + /// - `inputs` is empty + /// + /// # Optimization + /// If there is only one input, returns that input directly rather than wrapping it in a UnionExec + pub fn try_new( + inputs: Vec>, + ) -> Result> { + match inputs.len() { + 0 => exec_err!("UnionExec requires at least one input"), + 1 => Ok(inputs.into_iter().next().unwrap()), + _ => { + let schema = union_schema(&inputs)?; + // The schema of the inputs and the union schema is consistent when: + // - They have the same number of fields, and + // - Their fields have same types at the same indices. + // Here, we know that schemas are consistent and the call below can + // not return an error. + let cache = Self::compute_properties(&inputs, schema).unwrap(); + Ok(Arc::new(UnionExec { + inputs, + metrics: ExecutionPlanMetricsSet::new(), + cache, + })) + } + } + } + /// Get inputs of the execution plan pub fn inputs(&self) -> &Vec> { &self.inputs @@ -187,10 +222,6 @@ impl ExecutionPlan for UnionExec { )) } - fn children(&self) -> Vec<&Arc> { - self.inputs.iter().collect() - } - fn maintains_input_order(&self) -> Vec { // If the Union has an output ordering, it maintains at least one // child's ordering (i.e. the meet). 
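(Illustrative sketch, not part of this patch: how a call site migrates from the deprecated constructor to `try_new`, the API added in the hunk above. The `make_union` helper is a hypothetical name introduced here for illustration.)

    use std::sync::Arc;
    use datafusion_common::Result;
    use datafusion_physical_plan::union::UnionExec;
    use datafusion_physical_plan::ExecutionPlan;

    // `UnionExec::new` is now deprecated and panics on empty inputs, while
    // `try_new` reports empty inputs as an error and returns a single input
    // unchanged instead of wrapping it in a one-child UnionExec.
    fn make_union(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Result<Arc<dyn ExecutionPlan>> {
        UnionExec::try_new(inputs)
    }
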
@@ -216,11 +247,19 @@ impl ExecutionPlan for UnionExec { } } + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false; self.children().len()] + } + + fn children(&self) -> Vec<&Arc> { + self.inputs.iter().collect() + } + fn with_new_children( self: Arc, children: Vec>, ) -> Result> { - Ok(Arc::new(UnionExec::new(children))) + UnionExec::try_new(children) } fn execute( @@ -293,10 +332,6 @@ impl ExecutionPlan for UnionExec { } } - fn benefits_from_input_partitioning(&self) -> Vec { - vec![false; self.children().len()] - } - fn supports_limit_pushdown(&self) -> bool { true } @@ -319,8 +354,18 @@ impl ExecutionPlan for UnionExec { .map(|child| make_with_child(projection, child)) .collect::>>()?; + #[allow(deprecated)] Ok(Some(Arc::new(UnionExec::new(new_children)))) } + + fn gather_filters_for_pushdown( + &self, + _phase: FilterPushdownPhase, + parent_filters: Vec>, + _config: &ConfigOptions, + ) -> Result { + FilterDescription::from_children(parent_filters, &self.children()) + } } /// Combines multiple input streams by interleaving them. @@ -373,7 +418,7 @@ impl InterleaveExec { "Not all InterleaveExec children have a consistent hash partitioning" ); } - let cache = Self::compute_properties(&inputs); + let cache = Self::compute_properties(&inputs)?; Ok(InterleaveExec { inputs, metrics: ExecutionPlanMetricsSet::new(), @@ -387,17 +432,17 @@ impl InterleaveExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn compute_properties(inputs: &[Arc]) -> PlanProperties { - let schema = union_schema(inputs); + fn compute_properties(inputs: &[Arc]) -> Result { + let schema = union_schema(inputs)?; let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: let output_partitioning = inputs[0].output_partitioning().clone(); - PlanProperties::new( + Ok(PlanProperties::new( eq_properties, output_partitioning, emission_type_from_children(inputs), boundedness_from_children(inputs), - ) + )) } } @@ -538,7 +583,11 @@ pub fn can_interleave>>( .all(|partition| partition == *reference) } -fn union_schema(inputs: &[Arc]) -> SchemaRef { +fn union_schema(inputs: &[Arc]) -> Result { + if inputs.is_empty() { + return exec_err!("Cannot create union schema from empty inputs"); + } + let first_schema = inputs[0].schema(); let fields = (0..first_schema.fields().len()) @@ -581,7 +630,10 @@ fn union_schema(inputs: &[Arc]) -> SchemaRef { .flat_map(|i| i.schema().metadata().clone().into_iter()) .collect(); - Arc::new(Schema::new_with_metadata(fields, all_metadata_merged)) + Ok(Arc::new(Schema::new_with_metadata( + fields, + all_metadata_merged, + ))) } /// CombinedRecordBatchStream can be used to combine a Vec of SendableRecordBatchStreams into one @@ -710,6 +762,7 @@ mod tests { let csv = test::scan_partitioned(4); let csv2 = test::scan_partitioned(5); + #[allow(deprecated)] let union_exec = Arc::new(UnionExec::new(vec![csv, csv2])); // Should have 9 partitions and 9 output batches @@ -892,6 +945,7 @@ mod tests { let mut union_expected_eq = EquivalenceProperties::new(Arc::clone(&schema)); union_expected_eq.add_orderings(union_expected_orderings); + #[allow(deprecated)] let union = UnionExec::new(vec![child1, child2]); let union_eq_properties = union.properties().equivalence_properties(); let err_msg = format!( @@ -916,4 +970,66 @@ mod tests { assert!(lhs_orderings.contains(rhs_ordering), "{}", err_msg); } } + + #[test] + fn test_union_empty_inputs() { + // Test that UnionExec::try_new fails 
with empty inputs + let result = UnionExec::try_new(vec![]); + assert!(result + .unwrap_err() + .to_string() + .contains("UnionExec requires at least one input")); + } + + #[test] + fn test_union_schema_empty_inputs() { + // Test that union_schema fails with empty inputs + let result = union_schema(&[]); + assert!(result + .unwrap_err() + .to_string() + .contains("Cannot create union schema from empty inputs")); + } + + #[test] + fn test_union_single_input() -> Result<()> { + // Test that UnionExec::try_new returns the single input directly + let schema = create_test_schema()?; + let memory_exec: Arc = + Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?); + let memory_exec_clone = Arc::clone(&memory_exec); + let result = UnionExec::try_new(vec![memory_exec])?; + + // Check that the result is the same as the input (no UnionExec wrapper) + assert_eq!(result.schema(), schema); + // Verify it's the same execution plan + assert!(Arc::ptr_eq(&result, &memory_exec_clone)); + + Ok(()) + } + + #[test] + fn test_union_schema_multiple_inputs() -> Result<()> { + // Test that existing functionality with multiple inputs still works + let schema = create_test_schema()?; + let memory_exec1 = + Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?); + let memory_exec2 = + Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?); + + let union_plan = UnionExec::try_new(vec![memory_exec1, memory_exec2])?; + + // Downcast to verify it's a UnionExec + let union = union_plan + .as_any() + .downcast_ref::() + .expect("Expected UnionExec"); + + // Check that schema is correct + assert_eq!(union.schema(), schema); + // Check that we have 2 inputs + assert_eq!(union.inputs().len(), 2); + + Ok(()) + } } diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 8e38b0d1bf5b..ad5c33e0e74e 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -1405,6 +1405,7 @@ impl protobuf::PhysicalPlanNode { for input in &union.inputs { inputs.push(input.try_into_physical_plan(ctx, runtime, extension_codec)?); } + #[allow(deprecated)] Ok(Arc::new(UnionExec::new(inputs))) } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index a5357a132eef..f408ec1a9124 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -1649,6 +1649,7 @@ fn roundtrip_union() -> Result<()> { let left = EmptyExec::new(Arc::new(schema_left)); let right = EmptyExec::new(Arc::new(schema_right)); let inputs: Vec> = vec![Arc::new(left), Arc::new(right)]; + #[allow(deprecated)] let union = UnionExec::new(inputs); roundtrip_test(Arc::new(union)) } From 34c8e49a36343092fc2a1dce2ceb503c2b5dfd3d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 18 Nov 2025 05:55:16 -0500 Subject: [PATCH 13/13] Switch from xz2 to liblzma to reduce duplicate dependencies (#17509) Closes https://github.com/apache/datafusion/issues/15342 Reduces the duplicate dependencies. We currently depend on bzip2 in two different ways. In attempting to reduce this, I needed to update `async-compression` which caused two different libraries to link to the system lzma library. This is not allowed in rust. This PR updates avro-rs, but we cannot merge this PR until that crate merges https://github.com/apache/avro-rs/pull/284 and we remove the crates.io patch this PR contains. 
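(Illustrative sketch, not taken from this patch: the switch is essentially a crate rename at call sites, assuming the `liblzma` crate keeps `xz2`'s module layout as its drop-in fork advertises; `decompress_xz` is a hypothetical helper.)

    use std::io::Read;

    // before: use xz2::read::XzDecoder;
    use liblzma::read::XzDecoder;

    fn decompress_xz(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
        // XzDecoder wraps any `Read` source; only the crate path changes.
        let mut out = Vec::new();
        XzDecoder::new(compressed).read_to_end(&mut out)?;
        Ok(out)
    }
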
Update avro-rs and switch from the unmaintained xz2 crate to liblzma. Unit tests. None. This is simply a dependency update to a more recent crate. (cherry picked from commit 5765a08c842e0b9c3a609a58a5d951733291f734) --- Cargo.lock | 1775 ++++++++--------- Cargo.toml | 3 +- datafusion/common/Cargo.toml | 2 +- datafusion/core/Cargo.toml | 4 +- datafusion/core/src/test/mod.rs | 4 +- datafusion/datasource/Cargo.toml | 6 +- .../datasource/src/file_compression_type.rs | 4 +- 7 files changed, 875 insertions(+), 923 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eebb802394b9..5de25750ffd0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -50,15 +50,6 @@ dependencies = [ "core_extensions", ] -[[package]] -name = "addr2line" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" -dependencies = [ - "gimli", -] - [[package]] name = "adler2" version = "2.0.1" @@ -84,7 +75,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -92,9 +83,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -120,12 +111,6 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -143,9 +128,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -158,9 +143,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -173,41 +158,42 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", 
"once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "anyhow" -version = "1.0.99" +version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "apache-avro" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" +checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" dependencies = [ "bigdecimal", "bon", - "bzip2 0.6.0", + "bzip2", "crc32fast", "digest", + "liblzma", "log", "miniz_oxide", "num-bigint", @@ -218,14 +204,22 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.27.2", + "strum_macros 0.27.2", "thiserror", "uuid", - "xz2", "zstd", ] +[[package]] +name = "ar_archive_writer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -240,9 +234,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c26b57282a08ae92f727497805122fec964c6245cfa0e13f0e75452eaf3bc41f" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" dependencies = [ "arrow-arith", "arrow-array", @@ -264,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cebf38ca279120ff522f4954b81a39527425b6e9f615e6b72842f4de1ffe02b8" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +272,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744109142cdf8e7b02795e240e20756c2a782ac9180d4992802954a8f871c0de" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -289,15 +283,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "num", ] [[package]] name = "arrow-buffer" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601bb103c4c374bcd1f62c66bcea67b42a2ee91a690486c37d4c180236f11ccc" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" dependencies = [ "bytes", "half", @@ -306,9 +300,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed61d9d73eda8df9e3014843def37af3050b5080a9acbe108f045a316d5a0be" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +321,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fa95b96ce0c06b4d33ac958370db8c0d31e88e54f9d6e08b0353d18374d9f991" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" dependencies = [ "arrow-array", "arrow-cast", @@ -342,9 +336,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43407f2c6ba2367f64d85d4603d6fb9c4b92ed79d2ffd21021b37efa96523e12" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,9 +348,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c66c5e4a7aedc2bfebffeabc2116d76adb22e08d230b968b995da97f8b11ca" +checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" dependencies = [ "arrow-arith", "arrow-array", @@ -381,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4b0487c4d2ad121cbc42c4db204f1509f8618e589bc77e635e9c40b502e3b90" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" dependencies = [ "arrow-array", "arrow-buffer", @@ -397,9 +391,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d747573390905905a2dc4c5a61a96163fe2750457f90a04ee2a88680758c79" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" dependencies = [ "arrow-array", "arrow-buffer", @@ -408,7 +402,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.11.0", + "indexmap 2.12.0", "lexical-core", "memchr", "num", @@ -419,9 +413,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c142a147dceb59d057bad82400f1693847c80dca870d008bf7b91caf902810ae" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" dependencies = [ "arrow-array", "arrow-buffer", @@ -432,9 +426,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9038de599df1b81f63b42220e2b6cd6fd4f09af333858cd320db9bef5ac757" +checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" dependencies = [ "arrow-array", "arrow-data", @@ -444,9 +438,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac6620667fccdab4204689ca173bd84a15de6bb6b756c3a8764d4d7d0c2fc04" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" dependencies = [ "arrow-array", "arrow-buffer", @@ -457,20 +451,20 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfa93af9ff2bb80de539e6eb2c1c8764abd0f4b73ffb0d7c82bf1f9868785e66" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "serde", "serde_json", ] [[package]] name = "arrow-select" -version = "56.1.0" +version = "56.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "be8b2e0052cd20d36d64f32640b68a5ab54d805d24a473baee5d52017c85536c" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -482,9 +476,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2155e26e17f053c8975c546fc70cf19c00542f9abf43c23a88a46ef7204204f" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" dependencies = [ "arrow-array", "arrow-buffer", @@ -511,13 +505,12 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.17" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd389a4b2970a01282ee455294913c0a43724daedcd1a24c3eb0ec1c1320b66" +checksum = "bcbb6924530aa9e0432442af08bbcafdad182db80d2e560da42a6d442535bf85" dependencies = [ "anstyle", "bstr", - "doc-comment", "libc", "predicates", "predicates-core", @@ -527,19 +520,15 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "93c1f86859c1af3d514fa19e8323147ff10ea98684e6c7b307912509f50e67b2" dependencies = [ - "bzip2 0.5.2", - "flate2", + "compression-codecs", + "compression-core", "futures-core", - "memchr", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -559,7 +548,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -570,7 +559,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -596,9 +585,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.6" +version = "1.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bc1b40fb26027769f16960d2f4a6bc20c4bb755d403e552c8c1a73af433c246" +checksum = "1856b1b48b65f71a4dd940b1c0931f9a7b646d4a924b9828ffefc1454714668a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -626,9 +615,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.6" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d025db5d9f52cbc413b167136afb3d8aeea708c0d8884783cf6253be5e22f6f2" +checksum = "86590e57ea40121d47d3f2e131bfd873dea15d78dc2f4604f4734537ad9e56c4" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -638,9 +627,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.13.3" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba" +checksum = "5932a7d9d28b0d2ea34c6b3779d35e3dd6f6345317c34e73438c4f1f29144151" dependencies = [ "aws-lc-sys", "zeroize", @@ -648,9 +637,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.30.0" +version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" +checksum = "1826f2e4cfc2cd19ee53c42fbf68e2f81ec21108e0b7ecf6a71cf062137360fc" dependencies = 
[ "bindgen", "cc", @@ -661,9 +650,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.10" +version = "1.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c034a1bc1d70e16e7f4e4caf7e9f7693e4c9c24cd91cf17c2a0b21abaebc7c8b" +checksum = "8fe0fd441565b0b318c76e7206c8d1d0b0166b3e986cf30e890b61feb6192045" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -685,9 +674,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.83.0" +version = "1.89.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cd43af212d2a1c4dedff6f044d7e1961e5d9e7cfe773d70f31d9842413886" +checksum = "a9c1b1af02288f729e95b72bd17988c009aa72e26dcb59b3200f86d7aea726c9" dependencies = [ "aws-credential-types", "aws-runtime", @@ -707,9 +696,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.84.0" +version = "1.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20ec4a95bd48e0db7a424356a161f8d87bd6a4f0af37204775f0da03d9e39fc3" +checksum = "4e8122301558dc7c6c68e878af918880b82ff41897a60c8c4e18e4dc4d93e9f1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -729,9 +718,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.85.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410309ad0df4606bc721aff0d89c3407682845453247213a0ccc5ff8801ee107" +checksum = "a0c7808adcff8333eaa76a849e6de926c6ac1a1268b9fd6afe32de9c29ef29d2" dependencies = [ "aws-credential-types", "aws-runtime", @@ -752,9 +741,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.3.4" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c34162187d39e3740cb635acd73c4e3a551a36146ad6fe8883c929c9f876c" +checksum = "c35452ec3f001e1f2f6db107b6373f1f48f05ec63ba2c5c9fa91f07dad32af11" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -774,9 +763,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c" +checksum = "127fcfad33b7dfc531141fda7e1c402ac65f88aca5511a4d31e2e3d2cd01ce9c" dependencies = [ "futures-util", "pin-project-lite", @@ -785,15 +774,16 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.3" +version = "0.62.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4dacf2d38996cf729f55e7a762b30918229917eca115de45dfa8dfb97796c9" +checksum = "445d5d720c99eed0b4aa674ed00d835d9b1427dd73e04adaf2f94c6b2d6f9fca" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", + "futures-util", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -805,9 +795,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.1.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147e8eea63a40315d704b97bf9bc9b8c1402ae94f89d5ad6f7550d963309da1b" +checksum = "623254723e8dfd535f566ee7b2381645f8981da086b5c4aa26c0c41582bb1d2c" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -829,27 +819,27 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.5" +version = "0.61.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaa31b350998e703e9826b2104dd6f63be0508666e1aba88137af060e8944047" +checksum = 
"2db31f727935fc63c6eeae8b37b438847639ec330a9161ece694efba257e0c54" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9364d5989ac4dd918e5cc4c4bdcc61c9be17dcd2586ea7f69e348fc7c6cab393" +checksum = "2d1881b1ea6d313f9890710d65c158bdab6fb08c91ea825f74c1c8c357baf4cc" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +checksum = "d28a63441360c477465f80c7abac3b9c4d075ca638f982e605b7dc2a2c7156c9" dependencies = [ "aws-smithy-types", "urlencoding", @@ -857,9 +847,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.1" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3946acbe1ead1301ba6862e712c7903ca9bb230bdf1fbd1b5ac54158ef2ab1f" +checksum = "0bbe9d018d646b96c7be063dd07987849862b0e6d07c778aad7d93d1be6c1ef0" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -881,9 +871,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.0" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07f5e0fc8a6b3f2303f331b94504bbf754d85488f402d6f1dd7a6080f99afe56" +checksum = "ec7204f9fd94749a7c53b26da1b961b4ac36bf070ef1e0b94bb09f79d4f6c193" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -898,9 +888,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498595448e43de7f4296b7b7a18a8a02c61ec9349128c80a368f7c3b4ab11a8" +checksum = "25f535879a207fce0db74b679cfc3e91a3159c8144d717d55f5832aea9eef46e" dependencies = [ "base64-simd", "bytes", @@ -921,18 +911,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.10" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db87b96cb1b16c024980f133968d52882ca0daaee3a086c6decc500f6c99728" +checksum = "eab77cdd036b11056d2a30a7af7b775789fb024bf216acc13884c6c97752ae56" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.8" +version = "1.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b069d19bf01e46298eaedd7c6f283fe565a59263e53eebec945f3e6398f42390" +checksum = "d79fb68e3d7fe5d4833ea34dc87d2e97d26d3086cb3da660bb6b1f76d98680b6" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -944,9 +934,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.8.4" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" +checksum = "5b098575ebe77cb6d14fc7f32749631a6e44edbef6b796f89b020e99ba20d425" dependencies = [ "axum-core", "bytes", @@ -960,8 +950,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rustversion", - "serde", + "serde_core", "sync_wrapper", "tower", "tower-layer", @@ -970,9 +959,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.5.2" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +checksum = 
"59446ce19cd142f8833f856eb31f3eb097812d1479ab224f54d72428ca21ea22" dependencies = [ "bytes", "futures-core", @@ -981,27 +970,11 @@ dependencies = [ "http-body-util", "mime", "pin-project-lite", - "rustversion", "sync_wrapper", "tower-layer", "tower-service", ] -[[package]] -name = "backtrace" -version = "0.3.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-targets 0.52.6", -] - [[package]] name = "base64" version = "0.21.7" @@ -1026,9 +999,9 @@ dependencies = [ [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934" dependencies = [ "autocfg", "libm", @@ -1040,25 +1013,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.5" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "cexpr", "clang-sys", - "itertools 0.12.1", - "lazy_static", - "lazycell", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", + "rustc-hash", "shlex", - "syn 2.0.106", - "which", + "syn 2.0.110", ] [[package]] @@ -1069,9 +1039,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "bitvec" @@ -1168,9 +1138,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.7.1" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "537c317ddf588aab15c695bf92cf55dec159b93221c074180ca3e0e5a94da415" +checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1" dependencies = [ "bon-macros", "rustversion", @@ -1178,17 +1148,17 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.7.1" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5abbf2d4a4c6896197c9de13d6d7cb7eff438c63dacde1dde980569cb00248" +checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" dependencies = [ - "darling 0.21.3", + "darling", "ident_case", "prettyplease", "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1211,14 +1181,14 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "brotli" -version = "8.0.1" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1237,9 +1207,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.12.0" +version = "1.12.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", "regex-automata", @@ -1282,9 +1252,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" [[package]] name = "bytes-utils" @@ -1298,32 +1268,13 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cast" version = "0.3.0" @@ -1332,10 +1283,11 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.32" +version = "1.2.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e" +checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -1352,9 +1304,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.1" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "cfg_aliases" @@ -1364,17 +1316,16 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.41" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ - "android-tzdata", "iana-time-zone", "js-sys", "num-traits", "serde", "wasm-bindgen", - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -1422,7 +1373,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading 0.8.8", + "libloading 0.8.9", ] [[package]] @@ -1438,9 +1389,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.47" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" +checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8" dependencies = [ "clap_builder", "clap_derive", @@ -1448,9 +1399,9 @@ dependencies = [ 
[[package]] name = "clap_builder" -version = "4.5.47" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" +checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1" dependencies = [ "anstream", "anstyle", @@ -1460,21 +1411,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.47" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "clipboard-win" @@ -1502,14 +1453,36 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "comfy-table" -version = "7.1.4" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", - "unicode-width 0.2.1", + "strum 0.26.3", + "strum_macros 0.26.4", + "unicode-width 0.2.2", ] +[[package]] +name = "compression-codecs" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "680dc087785c5230f8e8843e2e57ac7c1c90488b6a91b88caa265410568f441b" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a9b614a5787ef0c8802a55766480563cb3a93b435898c422ed2a359cf811582" + [[package]] name = "console" version = "0.15.11" @@ -1524,15 +1497,15 @@ dependencies = [ [[package]] name = "console" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e09ced7ebbccb63b4c65413d821f2e00ce54c5ca4514ddc6b3c892fdbcbc69d" +checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" dependencies = [ "encode_unicode", "libc", "once_cell", - "unicode-width 0.2.1", - "windows-sys 0.60.2", + "unicode-width 0.2.2", + "windows-sys 0.61.2", ] [[package]] @@ -1567,9 +1540,9 @@ dependencies = [ [[package]] name = "const_panic" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb8a602185c3c95b52f86dc78e55a6df9a287a7a93ddbcf012509930880cf879" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" dependencies = [ "typewit", ] @@ -1598,18 +1571,18 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core_extensions" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" dependencies = [ "core_extensions_proc_macros", ] [[package]] name = 
"core_extensions_proc_macros" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" [[package]] name = "cpufeatures" @@ -1638,7 +1611,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.47", + "clap 4.5.52", "criterion-plot", "futures", "is-terminal", @@ -1709,9 +1682,9 @@ checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", @@ -1719,21 +1692,21 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] @@ -1760,38 +1733,14 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" -[[package]] -name = "darling" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" -dependencies = [ - "darling_core 0.20.11", - "darling_macro 0.20.11", -] - [[package]] name = "darling" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "darling_core 0.21.3", - "darling_macro 0.21.3", -] - -[[package]] -name = "darling_core" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn 2.0.106", + "darling_core", + "darling_macro", ] [[package]] @@ -1805,18 +1754,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.106", -] - -[[package]] -name = "darling_macro" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" -dependencies = [ - "darling_core 0.20.11", - "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1825,9 +1763,9 @@ version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "darling_core 0.21.3", + "darling_core", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1853,7 +1791,7 @@ dependencies = [ "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.0", + "bzip2", "chrono", "criterion", "ctor", @@ 
-1894,6 +1832,7 @@ dependencies = [ "hex", "insta", "itertools 0.14.0", + "liblzma", "log", "nix", "object_store", @@ -1913,7 +1852,6 @@ dependencies = [ "tokio", "url", "uuid", - "xz2", "zstd", ] @@ -1997,7 +1935,7 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", - "clap 4.5.47", + "clap 4.5.52", "ctor", "datafusion", "dirs", @@ -2033,7 +1971,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "hex", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "libc", "log", @@ -2065,7 +2003,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.6.0", + "bzip2", "chrono", "criterion", "datafusion-common", @@ -2081,6 +2019,7 @@ dependencies = [ "futures", "glob", "itertools 0.14.0", + "liblzma", "log", "object_store", "parquet", @@ -2089,7 +2028,6 @@ dependencies = [ "tokio", "tokio-util", "url", - "xz2", "zstd", ] @@ -2271,7 +2209,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "paste", "recursive", @@ -2285,7 +2223,7 @@ version = "50.0.0" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.11.0", + "indexmap 2.12.0", "itertools 0.14.0", "paste", ] @@ -2440,7 +2378,7 @@ version = "50.0.0" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -2461,7 +2399,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "itertools 0.14.0", "log", @@ -2485,13 +2423,13 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "itertools 0.14.0", "log", "parking_lot", "paste", - "petgraph 0.8.2", + "petgraph 0.8.3", "rand 0.9.2", "rstest", ] @@ -2567,7 +2505,7 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "itertools 0.14.0", "log", @@ -2692,7 +2630,7 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.11.0", + "indexmap 2.12.0", "insta", "log", "paste", @@ -2711,7 +2649,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.47", + "clap 4.5.52", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2769,7 +2707,7 @@ dependencies = [ "datafusion-optimizer", "datafusion-physical-plan", "datafusion-sql", - "getrandom 0.3.3", + "getrandom 0.3.4", "insta", "object_store", "tokio", @@ -2780,12 +2718,12 @@ dependencies = [ [[package]] name = "deranged" -version = "0.4.0" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", - "serde", + "serde_core", ] [[package]] @@ -2829,7 +2767,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2840,14 +2778,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "doc-comment" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9" [[package]] name = "docker_credential" @@ -2896,7 
+2834,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -2919,29 +2857,29 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" [[package]] name = "enum-ordinalize" -version = "4.3.0" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea0dcfa4e54eeb516fe454635a95753ddd39acda650ce703031c6973e315dd5" +checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0" dependencies = [ "enum-ordinalize-derive", ] [[package]] name = "enum-ordinalize-derive" -version = "4.3.1" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" +checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "env_filter" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" dependencies = [ "log", "regex", @@ -2968,12 +2906,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3018,7 +2956,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix 1.0.8", + "rustix", "windows-sys 0.59.0", ] @@ -3054,16 +2992,22 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.25" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586" +checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] +[[package]] +name = "find-msvc-tools" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" + [[package]] name = "fixedbitset" version = "0.5.7" @@ -3072,19 +3016,19 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "25.2.10" +version = "25.9.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "rustc_version", ] [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "libz-rs-sys", @@ -3112,6 +3056,12 @@ version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -3123,9 +3073,9 @@ dependencies = [ [[package]] name = "fs-err" -version = "3.1.1" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d7be93788013f265201256d58f04936a8079ad5dc898743aa20525f503b683" +checksum = "62d91fd049c123429b018c47887d3f75a265540dd3c30ba9cb7bae9197edb03a" dependencies = [ "autocfg", ] @@ -3198,7 +3148,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -3281,30 +3231,24 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "js-sys", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasip2", "wasm-bindgen", ] -[[package]] -name = "gimli" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" - [[package]] name = "glob" version = "0.3.3" @@ -3313,9 +3257,9 @@ checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "globset" -version = "0.4.16" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" dependencies = [ "aho-corasick", "bstr", @@ -3336,7 +3280,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.11.0", + "indexmap 2.12.0", "slab", "tokio", "tokio-util", @@ -3345,13 +3289,14 @@ dependencies = [ [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] @@ -3378,10 +3323,19 @@ name = "hashbrown" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.2.0", ] [[package]] @@ -3422,11 +3376,11 @@ dependencies = [ [[package]] name = "home" -version = "0.5.11" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = 
"cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3499,19 +3453,20 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "humantime" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.6.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ + "atomic-waker", "bytes", "futures-channel", - "futures-util", + "futures-core", "h2", "http 1.3.1", "http-body 1.0.1", @@ -3519,6 +3474,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", + "pin-utils", "smallvec", "tokio", "want", @@ -3571,9 +3527,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.16" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" +checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56" dependencies = [ "base64 0.22.1", "bytes", @@ -3587,7 +3543,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.0", + "socket2 0.6.1", "tokio", "tower-service", "tracing", @@ -3610,9 +3566,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.63" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -3620,7 +3576,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core", + "windows-core 0.62.2", ] [[package]] @@ -3634,9 +3590,9 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", @@ -3647,9 +3603,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -3660,11 +3616,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -3675,42 +3630,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -3758,33 +3709,37 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.11.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" +checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "serde", + "serde_core", ] [[package]] name = "indicatif" -version = "0.18.0" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd" +checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" dependencies = [ - "console 0.16.0", + "console 0.16.1", "portable-atomic", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "unit-prefix", "web-time", ] [[package]] name = "indoc" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] [[package]] name = "insta" @@ -3818,17 +3773,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-uring" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" -dependencies = [ - "bitflags 2.9.1", - "cfg-if", - "libc", -] - [[package]] name = "ipnet" version = "2.11.0" @@ -3837,9 +3781,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" dependencies = [ "memchr", "serde", @@ -3847,20 +3791,20 @@ dependencies = [ [[package]] name = "is-terminal" 
-version = "0.4.16" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -3871,15 +3815,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.13.0" @@ -3906,43 +3841,43 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jiff" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +checksum = "49cce2b81f2098e7e3efc35bc2e0a6b7abec9d34128283d7a26fa8f32a6dbb35" dependencies = [ "jiff-static", "log", "portable-atomic", "portable-atomic-util", - "serde", + "serde_core", ] [[package]] name = "jiff-static" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "jobserver" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -3954,17 +3889,11 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "lexical-core" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -3975,53 +3904,46 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = "lexical-parse-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] @@ -4032,9 +3954,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.175" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libloading" @@ -4048,12 +3970,32 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-targets 0.53.3", + "windows-link 0.2.1", +] + +[[package]] +name = "liblzma" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c36d08cad03a3fbe2c4e7bb3a9e84c57e4ee4135ed0b065cade3d98480c648" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b9596486f6d60c3bbe644c0e1be1aa6ccc472ad630fe8927b456973d7cb736" +dependencies = [ + "cc", + "libc", + "pkg-config", ] [[package]] @@ -4075,13 +4017,13 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" +checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "libc", - "redox_syscall 0.5.17", + "redox_syscall 0.5.18", ] [[package]] @@ -4092,52 +4034,45 @@ checksum = 
"5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.47", + "clap 4.5.52", "escape8259", ] [[package]] name = "libz-rs-sys" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" dependencies = [ "zlib-rs", ] [[package]] name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - -[[package]] -name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "lru-slab" @@ -4154,17 +4089,6 @@ dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matchit" version = "0.8.4" @@ -4183,9 +4107,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "memoffset" @@ -4234,17 +4158,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] name = "mio" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "windows-sys 0.59.0", + "wasi", + "windows-sys 0.61.2", ] [[package]] @@ -4268,7 +4193,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", 
"libc", @@ -4301,11 +4226,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.50.1" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4391,18 +4316,18 @@ dependencies = [ [[package]] name = "objc2-core-foundation" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", ] [[package]] name = "objc2-io-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" dependencies = [ "libc", "objc2-core-foundation", @@ -4410,18 +4335,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.7" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efc4f07659e11cd45a341cd24d71e683e3be65d9ff1f8150061678fe60437496" +checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" dependencies = [ "async-trait", "base64 0.22.1", @@ -4462,9 +4387,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ -4501,15 +4426,15 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "owo-colors" -version = "4.2.2" +version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" +checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" [[package]] name = "parking_lot" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -4517,22 +4442,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.11" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.17", + "redox_syscall 0.5.18", "smallvec", - "windows-targets 0.52.6", + "windows-link 0.2.1", ] 
[[package]] name = "parquet" -version = "56.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b56b41d1bd36aae415e42f91cae70ee75cf6cba74416b14dce3e958d5990ec" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4549,7 +4474,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "lz4_flex", "num", "num-bigint", @@ -4587,7 +4512,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4646,53 +4571,54 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.11.0", + "indexmap 2.12.0", ] [[package]] name = "petgraph" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.11.0", + "indexmap 2.12.0", "serde", ] [[package]] name = "phf" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "phf_shared 0.11.3", + "phf_shared 0.12.1", ] [[package]] name = "phf" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ - "phf_shared 0.12.1", + "phf_shared 0.13.1", + "serde", ] [[package]] name = "phf_shared" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" dependencies = [ "siphasher", ] [[package]] name = "phf_shared" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" dependencies = [ "siphasher", ] @@ -4714,7 +4640,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4780,21 +4706,21 @@ dependencies = [ [[package]] name = "postgres-derive" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69700ea4603c5ef32d447708e6a19cd3e8ac197a000842e97f527daea5e4175f" +checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "postgres-protocol" -version = "0.6.8" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76ff0abab4a9b844b93ef7b81f1efc0a366062aaef2cd702c76256b5dc075c54" +checksum = "fbef655056b916eb868048276cfd5d6a7dea4f81560dfd047f97c8c6fe3fcfd4" dependencies = 
[ "base64 0.22.1", "byteorder", @@ -4810,9 +4736,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.9" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613283563cd90e1dfc3518d548caee47e0e725455ed619881f5cf21f36de4b48" +checksum = "ef4605b7c057056dd35baeb6ac0c0338e4975b1f2bef0f65da953285eb007095" dependencies = [ "bytes", "chrono", @@ -4823,9 +4749,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -4887,19 +4813,19 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.36" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "proc-macro-crate" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ "toml_edit", ] @@ -4930,9 +4856,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.97" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] @@ -4963,7 +4889,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.106", + "syn 2.0.110", "tempfile", ] @@ -4977,7 +4903,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5000,10 +4926,11 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.26" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" dependencies = [ + "ar_archive_writer", "cc", ] @@ -5073,7 +5000,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5086,7 +5013,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5097,9 +5024,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.38.1" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9845d9dccf565065824e69f9f235fafba1587031eda353c1f1561cd6a6be78f4" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", "serde", @@ -5107,18 +5034,18 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ 
"bytes", "cfg_aliases", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", - "socket2 0.5.10", + "socket2 0.6.1", "thiserror", "tokio", "tracing", @@ -5127,16 +5054,16 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", - "getrandom 0.3.3", + "getrandom 0.3.4", "lru-slab", "rand 0.9.2", "ring", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "rustls-pki-types", "slab", @@ -5148,23 +5075,23 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcebb1209ee276352ef14ff8732e24cc2b02bbac986cd74a4c81bcb2f9881970" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.1", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -5247,7 +5174,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", ] [[package]] @@ -5297,7 +5224,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5311,11 +5238,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.17" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", ] [[package]] @@ -5331,29 +5258,29 @@ dependencies = [ [[package]] name = "ref-cast" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "regex" -version = "1.11.2" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -5363,9 +5290,9 @@ dependencies = [ 
[[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -5374,23 +5301,23 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "regress" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145bb27393fe455dd64d6cbc8d059adfa392590a45eadf079c01b11857e7b010" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.0", "memchr", ] @@ -5420,9 +5347,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" dependencies = [ "base64 0.22.1", "bytes", @@ -5529,7 +5456,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.106", + "syn 2.0.110", "unicode-ident", ] @@ -5541,14 +5468,14 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "rust_decimal" -version = "1.37.2" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b203a6425500a03e0919c42d3c47caca51e79f1132046626d2c8871c5092035d" +checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282" dependencies = [ "arrayvec", "borsh", @@ -5561,18 +5488,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "rustc-demangle" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" - -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -5590,35 +5505,22 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags 2.9.1", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - -[[package]] -name = "rustix" -version = "1.0.8" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.1", + 
"bitflags 2.10.0", "errno", "libc", - "linux-raw-sys 0.9.4", - "windows-sys 0.60.2", + "linux-raw-sys", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.31" +version = "0.23.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" dependencies = [ "aws-lc-rs", "once_cell", @@ -5631,9 +5533,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +checksum = "9980d917ebb0c0536119ba501e90834767bffc3d60641457fd84a1f3fd337923" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -5652,9 +5554,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = "94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a" dependencies = [ "web-time", "zeroize", @@ -5662,9 +5564,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.4" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ "aws-lc-rs", "ring", @@ -5680,11 +5582,11 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyline" -version = "17.0.1" +version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6614df0b6d4cfb20d1d5e295332921793ce499af3ebc011bf1e393380e1e492" +checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "cfg-if", "clipboard-win", "fd-lock", @@ -5695,7 +5597,7 @@ dependencies = [ "nix", "radix_trie", "unicode-segmentation", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "utf8parse", "windows-sys 0.60.2", ] @@ -5717,11 +5619,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -5750,9 +5652,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +checksum = "9558e172d4e8533736ba97870c4b2cd63f84b382a3d6eb063da41b91cce17289" dependencies = [ "dyn-clone", "ref-cast", @@ -5769,7 +5671,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5786,11 +5688,11 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "security-framework" -version = "3.3.0" +version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" +checksum = 
"b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "core-foundation", "core-foundation-sys", "libc", @@ -5799,9 +5701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.14.0" +version = "2.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" dependencies = [ "core-foundation-sys", "libc", @@ -5809,11 +5711,12 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" dependencies = [ "serde", + "serde_core", ] [[package]] @@ -5824,31 +5727,42 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ + "serde_core", "serde_derive", ] [[package]] name = "serde_bytes" -version = "0.11.17" +version = "0.11.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8437fd221bde2d4ca316d61b90e337e9e702b3820b87d63caa9ba6c02bd06d96" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" dependencies = [ "serde", + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5859,19 +5773,20 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "serde_json" -version = "1.0.143" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] @@ -5882,7 +5797,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5894,7 +5809,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5911,19 +5826,18 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.14.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5" +checksum = 
"10574371d41b0d9b2cff89418eda27da52bcaff2cc8741db26382a77c29131f1" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.11.0", + "indexmap 2.12.0", "schemars 0.9.0", - "schemars 1.0.4", - "serde", - "serde_derive", + "schemars 1.1.0", + "serde_core", "serde_json", "serde_with_macros", "time", @@ -5931,14 +5845,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.14.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f" +checksum = "08a72d8216842fdd57820dc78d840bef99248e35fb2554ff923319e60f2d686b" dependencies = [ - "darling 0.20.11", + "darling", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5947,7 +5861,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.11.0", + "indexmap 2.12.0", "itoa", "ryu", "serde", @@ -6000,6 +5914,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "simdutf8" version = "0.1.5" @@ -6066,19 +5986,19 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "sqllogictest" -version = "0.28.3" +version = "0.28.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fcbf91368a8d6807093d94f274fa4d0978cd78a310fee1d20368c545a606f7a" +checksum = "3566426f72a13e393aa34ca3d542c5b0eb86da4c0db137ee9b5cfccc6179e52d" dependencies = [ "async-trait", "educe", @@ -6118,20 +6038,20 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" dependencies = [ "cc", "cfg-if", @@ -6140,12 +6060,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stringprep" version = "0.1.5" @@ -6172,7 +6086,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6183,7 +6097,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] 
[[package]] @@ -6210,12 +6124,31 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.110", +] + [[package]] name = "strum_macros" version = "0.27.2" @@ -6225,7 +6158,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6259,7 +6192,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.106", + "syn 2.0.110", "typify", "walkdir", ] @@ -6283,9 +6216,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.106" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", @@ -6309,14 +6242,14 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "sysinfo" -version = "0.37.0" +version = "0.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07cec4dc2d2e357ca1e610cfb07de2fa7a10fc3e9fe89f72545f3d244ea87753" +checksum = "16607d5caffd1c07ce073528f9ed972d88db15dd44023fa57142963be3feb11f" dependencies = [ "libc", "memchr", @@ -6334,21 +6267,21 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "tempfile" -version = "3.21.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", - "rustix 1.0.8", - "windows-sys 0.60.2", + "rustix", + "windows-sys 0.61.2", ] [[package]] @@ -6417,22 +6350,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 
2.0.110", ] [[package]] @@ -6457,9 +6390,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.41" +version = "0.3.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" dependencies = [ "deranged", "itoa", @@ -6472,15 +6405,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" [[package]] name = "time-macros" -version = "0.2.22" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" dependencies = [ "num-conv", "time-core", @@ -6497,9 +6430,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -6517,9 +6450,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -6532,40 +6465,37 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.47.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", - "slab", - "socket2 0.6.0", + "socket2 0.6.1", "tokio-macros", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "tokio-postgres" -version = "0.7.13" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c95d533c83082bb6490e0189acaa0bbeef9084e60471b696ca6988cd0541fb0" +checksum = "2b40d66d9b2cfe04b628173409368e58247e8eddbbd3b0e6c6ba1d09f20f6c9e" dependencies = [ "async-trait", "byteorder", @@ -6576,12 +6506,12 @@ dependencies = [ "log", "parking_lot", "percent-encoding", - "phf 0.11.3", + "phf 0.13.1", "pin-project-lite", "postgres-protocol", "postgres-types", "rand 0.9.2", - "socket2 0.5.10", + "socket2 0.6.1", "tokio", "tokio-util", "whoami", @@ -6589,9 +6519,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.2" +version = "0.26.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", @@ -6625,9 +6555,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -6638,18 +6568,31 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" +dependencies = [ + "serde_core", +] [[package]] name = "toml_edit" -version = "0.22.27" +version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" dependencies = [ - "indexmap 2.11.0", + "indexmap 2.12.0", "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" +dependencies = [ "winnow", ] @@ -6690,7 +6633,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.11.0", + "indexmap 2.12.0", "pin-project-lite", "slab", "sync_wrapper", @@ -6707,7 +6650,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.10.0", "bytes", "futures-util", "http 1.3.1", @@ -6750,7 +6693,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6811,9 +6754,9 @@ checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" [[package]] name = "twox-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" [[package]] name = "typed-arena" @@ -6823,21 +6766,21 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "typewit" -version = "1.12.1" +version = "1.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97e72ba082eeb9da9dc68ff5a2bf727ef6ce362556e8d29ec1aed3bd05e7d86a" +checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" [[package]] name = "typify" -version = "0.4.2" +version = "0.4.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c6c647a34e851cf0260ccc14687f17cdcb8302ff1a8a687a24b97ca0f82406f" +checksum = "7144144e97e987c94758a3017c920a027feac0799df325d6df4fc8f08d02068e" dependencies = [ "typify-impl", "typify-macro", @@ -6845,9 +6788,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "741b7f1e2e1338c0bee5ad5a7d3a9bbd4e24c33765c08b7691810e68d879365d" +checksum = "062879d46aa4c9dfe0d33b035bbaf512da192131645d05deacb7033ec8581a09" dependencies = [ "heck 0.5.0", "log", @@ -6858,16 +6801,16 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.106", + "syn 2.0.110", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7560adf816a1e8dad7c63d8845ef6e31e673e39eab310d225636779230cbedeb" +checksum = "9708a3ceb6660ba3f8d2b8f0567e7d4b8b198e2b94d093b8a6077a751425de9e" dependencies = [ "proc-macro2", "quote", @@ -6876,7 +6819,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.106", + "syn 2.0.110", "typify-impl", ] @@ -6888,24 +6831,24 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] [[package]] name = "unicode-properties" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" @@ -6921,9 +6864,9 @@ checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode-width" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] name = "unindent" @@ -6933,9 +6876,9 @@ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" [[package]] name = "unit-prefix" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" [[package]] name = "unsafe-libyaml" @@ -6985,7 +6928,7 @@ version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "serde", "wasm-bindgen", @@ -7044,12 +6987,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.14.2+wasi-0.2.4" +name = "wasip2" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen-rt", + "wit-bindgen", ] [[package]] @@ -7060,35 +7003,22 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn 2.0.106", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.50" +version = "0.4.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" dependencies = [ "cfg-if", "js-sys", @@ -7099,9 +7029,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7109,31 +7039,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", - "syn 2.0.106", - "wasm-bindgen-backend", + "syn 2.0.110", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.50" +version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66c8d5e33ca3b6d9fa3b4676d774c5778031d27a578c2b007f905acf816152c3" +checksum = "bfc379bfb624eb59050b509c13e77b4eb53150c350db69628141abce842f2373" dependencies = [ "js-sys", "minicov", @@ -7144,13 +7074,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.50" +version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17d5042cc5fa009658f9a7333ef24291b1291a25b6382dd68862a7f3b969f69b" +checksum = "085b2df989e1e6f9620c1311df6c996e83fe16f57792b272ce1e024ac16a90f1" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7168,9 +7098,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -7186,18 +7116,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whoami" version = "1.6.1" @@ -7227,11 +7145,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -7247,9 +7165,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" dependencies = [ "windows-collections", - "windows-core", + "windows-core 0.61.2", "windows-future", - "windows-link", + "windows-link 0.1.3", "windows-numerics", ] @@ -7259,7 +7177,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" dependencies = [ - "windows-core", + "windows-core 0.61.2", ] [[package]] @@ -7270,9 +7188,22 @@ checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" dependencies = [ "windows-implement", "windows-interface", - "windows-link", - "windows-result", - "windows-strings", + "windows-link 0.1.3", + "windows-result 0.3.4", + "windows-strings 0.4.2", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link 0.2.1", + "windows-result 0.4.1", + "windows-strings 0.5.1", ] [[package]] @@ -7281,31 +7212,31 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" dependencies = [ - "windows-core", - "windows-link", + "windows-core 0.61.2", + "windows-link 0.1.3", "windows-threading", ] [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7314,14 +7245,20 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-numerics" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" dependencies = [ - "windows-core", - "windows-link", + "windows-core 0.61.2", + "windows-link 0.1.3", ] [[package]] @@ -7330,7 +7267,16 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" dependencies = [ - "windows-link", + "windows-link 0.1.3", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link 0.2.1", ] [[package]] @@ -7339,7 +7285,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" dependencies = [ - "windows-link", + "windows-link 0.1.3", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link 0.2.1", ] [[package]] @@ -7366,7 +7321,16 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.3", + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", ] [[package]] @@ -7387,19 +7351,19 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.3" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows-link", - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", + "windows-link 0.2.1", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] @@ -7408,7 +7372,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6" dependencies = [ - "windows-link", + "windows-link 0.1.3", ] [[package]] @@ -7419,9 +7383,9 @@ checksum = 
"32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" @@ -7431,9 +7395,9 @@ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" @@ -7443,9 +7407,9 @@ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" @@ -7455,9 +7419,9 @@ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" @@ -7467,9 +7431,9 @@ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" @@ -7479,9 +7443,9 @@ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" @@ -7491,9 +7455,9 @@ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" @@ -7503,33 +7467,30 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.12" +version = 
"0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" dependencies = [ "memchr", ] [[package]] -name = "wit-bindgen-rt" -version = "0.39.0" +name = "wit-bindgen" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags 2.9.1", -] +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "wyz" @@ -7542,12 +7503,12 @@ dependencies = [ [[package]] name = "xattr" -version = "1.5.1" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af3a19837351dc82ba89f8a125e22a3c475f05aba604acc023d62b2739ae2909" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.0.8", + "rustix", ] [[package]] @@ -7562,15 +7523,6 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yansi" version = "1.0.1" @@ -7579,11 +7531,10 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -7591,34 +7542,34 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7638,21 +7589,21 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", 
- "syn 2.0.106", + "syn 2.0.110", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -7661,9 +7612,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -7672,20 +7623,20 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "zlib-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" [[package]] name = "zstd" @@ -7707,9 +7658,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" +version = "2.0.16+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 78e437250627..52c429926e3a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -89,7 +89,7 @@ version = "50.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -apache-avro = { version = "0.20", default-features = false } +apache-avro = { version = "0.21", default-features = false } arrow = { version = "56.1.0", features = [ "prettyprint", "chrono-tz", @@ -154,6 +154,7 @@ hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3" } indexmap = "2.11.0" itertools = "0.14" +liblzma = { version = "0.4.4", features = ["static"] } log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index aea5e51befb0..a3c08f1d7830 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -52,7 +52,7 @@ parquet = ["dep:parquet"] [dependencies] ahash = { workspace = true } -apache-avro = { version = "0.20", default-features = false, features = [ +apache-avro = { workspace = true, features = [ "bzip", "snappy", "xz", diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 21c6e38945ee..1955c6ec2143 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -43,7 +43,7 @@ array_expressions = ["nested_expressions"] avro = 
["datafusion-common/avro", "datafusion-datasource-avro"] backtrace = ["datafusion-common/backtrace"] compression = [ - "xz2", + "liblzma", "bzip2", "flate2", "zstd", @@ -138,6 +138,7 @@ flate2 = { version = "1.1.2", optional = true } futures = { workspace = true } hex = { workspace = true, optional = true } itertools = { workspace = true } +liblzma = { workspace = true, optional = true } log = { workspace = true } object_store = { workspace = true } parking_lot = { workspace = true } @@ -150,7 +151,6 @@ tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } uuid = { version = "1.18", features = ["v4", "js"] } -xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 68f83e7f1f11..2fe9d03c3ddb 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -53,9 +53,9 @@ use datafusion_datasource_csv::partitioned_csv_config; use flate2::write::GzEncoder; #[cfg(feature = "compression")] use flate2::Compression as GzCompression; -use object_store::local_unpartitioned_file; #[cfg(feature = "compression")] -use xz2::write::XzEncoder; +use liblzma::write::XzEncoder; +use object_store::local_unpartitioned_file; #[cfg(feature = "compression")] use zstd::Encoder as ZstdEncoder; diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 6b2c6cbd405c..d39c457f8111 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -32,12 +32,12 @@ all-features = true [features] parquet = ["dep:parquet", "tempfile"] -compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"] +compression = ["async-compression", "liblzma", "bzip2", "flate2", "zstd", "tokio-util"] default = ["compression"] [dependencies] arrow = { workspace = true } -async-compression = { version = "0.4.19", features = [ +async-compression = { version = "0.4.30", features = [ "bzip2", "gzip", "xz", @@ -61,6 +61,7 @@ flate2 = { version = "1.1.2", optional = true } futures = { workspace = true } glob = "0.3.0" itertools = { workspace = true } +liblzma = { workspace = true, optional = true } log = { workspace = true } object_store = { workspace = true } parquet = { workspace = true, optional = true } @@ -69,7 +70,6 @@ tempfile = { workspace = true, optional = true } tokio = { workspace = true } tokio-util = { version = "0.7.16", features = ["io"], optional = true } url = { workspace = true } -xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] diff --git a/datafusion/datasource/src/file_compression_type.rs b/datafusion/datasource/src/file_compression_type.rs index 7cc3142564e9..9ca5d8763b74 100644 --- a/datafusion/datasource/src/file_compression_type.rs +++ b/datafusion/datasource/src/file_compression_type.rs @@ -43,13 +43,13 @@ use futures::stream::BoxStream; use futures::StreamExt; #[cfg(feature = "compression")] use futures::TryStreamExt; +#[cfg(feature = "compression")] +use liblzma::read::XzDecoder; use object_store::buffered::BufWriter; use tokio::io::AsyncWrite; #[cfg(feature = "compression")] use tokio_util::io::{ReaderStream, StreamReader}; #[cfg(feature = "compression")] -use xz2::read::XzDecoder; -#[cfg(feature = "compression")] use zstd::Decoder as ZstdDecoder; /// Readable file compression type