From 58469d910d5f1945f030594ace9e7e8152223389 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 24 Oct 2025 17:04:55 -0500 Subject: [PATCH 1/2] fix null count stats computation --- datafusion/datasource-parquet/src/metadata.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs index 4de68793ce02..c8ee4d3b9f57 100644 --- a/datafusion/datasource-parquet/src/metadata.rs +++ b/datafusion/datasource-parquet/src/metadata.rs @@ -299,7 +299,6 @@ impl<'a> DFParquetMetadata<'a> { summarize_min_max_null_counts( &mut accumulators, idx, - num_rows, &stats_converter, row_groups_metadata, ) @@ -417,7 +416,6 @@ struct StatisticsAccumulators<'a> { fn summarize_min_max_null_counts( accumulators: &mut StatisticsAccumulators, arrow_schema_index: usize, - num_rows: usize, stats_converter: &StatisticsConverter, row_groups_metadata: &[RowGroupMetaData], ) -> Result<()> { @@ -449,11 +447,14 @@ fn summarize_min_max_null_counts( ); } - accumulators.null_counts_array[arrow_schema_index] = - Precision::Exact(match sum(&null_counts) { - Some(null_count) => null_count as usize, - None => num_rows, - }); + accumulators.null_counts_array[arrow_schema_index] = match sum(&null_counts) { + Some(null_count) => Precision::Exact(null_count as usize), + None => match null_counts.len() { + // If sum() returned None we either have no rows or all values are null + 0 => Precision::Exact(0), + _ => Precision::Absent, + }, + }; Ok(()) } From 75471c0532324e39708100934a368a2c47c8aed2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 24 Oct 2025 17:39:54 -0500 Subject: [PATCH 2/2] add test --- .../sqllogictest/test_files/parquet.slt | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index e722005bf0f0..11942108ab2b 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -862,3 +862,30 @@ select part, k, v from t order by k statement ok DROP TABLE t; + +# Regression test for files with stats on some columns and not others +# See https://github.com/apache/datafusion/pull/18276 + +query I +COPY (SELECT 1::int AS a, 2::int as b) +TO 'test_files/scratch/parquet/mixed_stats.parquet' +STORED AS PARQUET OPTIONS ( + 'STATISTICS_ENABLED::b' 'none' +); +---- +1 + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/parquet/mixed_stats.parquet'; + +query I +SELECT b +FROM t +WHERE b = 2; +---- +2 + +statement ok +DROP TABLE t;