Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions parquet/benches/arrow_reader_clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -716,15 +716,15 @@ impl ReadTest {
};

// setup the reader
let mut stream = ParquetRecordBatchStreamBuilder::new_with_metadata(
let builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
parquet_file,
self.arrow_reader_metadata.clone(),
)
.with_batch_size(8192)
.with_projection(self.projection_mask.clone())
.with_row_filter(self.row_filter())
.build()
.unwrap();
.with_scatter_threshold(Some(0.01));
let mut stream = builder.build().unwrap();

// run the stream to its end
let mut row_count = 0;
Expand All @@ -747,15 +747,15 @@ impl ReadTest {
let reader = ParquetObjectReader::new(store, location);

// setup the reader
let mut stream = ParquetRecordBatchStreamBuilder::new_with_metadata(
let builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
reader,
self.arrow_reader_metadata.clone(),
)
.with_batch_size(8192)
.with_projection(self.projection_mask.clone())
.with_row_filter(self.row_filter())
.build()
.unwrap();
.with_scatter_threshold(Some(0.01));
let mut stream = builder.build().unwrap();

// run the stream to its end
let mut row_count = 0;
Expand All @@ -774,15 +774,15 @@ impl ReadTest {
};

// setup the reader
let reader = ParquetRecordBatchReaderBuilder::new_with_metadata(
let builder = ParquetRecordBatchReaderBuilder::new_with_metadata(
parquet_file,
self.arrow_reader_metadata.clone(),
)
.with_batch_size(8192)
.with_projection(self.projection_mask.clone())
.with_row_filter(self.row_filter())
.build()
.unwrap();
.with_scatter_threshold(Some(0.01));
let reader = builder.build().unwrap();

// run the stream to its end
let mut row_count = 0;
Expand Down
42 changes: 40 additions & 2 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ pub struct ArrowReaderBuilder<T> {
pub(crate) metrics: ArrowReaderMetrics,

pub(crate) max_predicate_cache_size: usize,

pub(crate) scatter_threshold: Option<f64>,
}

impl<T: Debug> Debug for ArrowReaderBuilder<T> {
Expand All @@ -157,6 +159,7 @@ impl<T: Debug> Debug for ArrowReaderBuilder<T> {
.field("limit", &self.limit)
.field("offset", &self.offset)
.field("metrics", &self.metrics)
.field("scatter_threshold", &self.scatter_threshold)
.finish()
}
}
Expand All @@ -178,6 +181,7 @@ impl<T> ArrowReaderBuilder<T> {
offset: None,
metrics: ArrowReaderMetrics::Disabled,
max_predicate_cache_size: 100 * 1024 * 1024, // 100MB default cache size
scatter_threshold: None,
}
}

Expand Down Expand Up @@ -430,6 +434,32 @@ impl<T> ArrowReaderBuilder<T> {
..self
}
}

/// Set a scatter threshold for filter deferral.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this even more now! This is now dealing with selector density, something that would be hard to do on the datafusion side and is correlated but not equivalent to overall filter selectivity, which I think is what probably ends up mattering more for coarser IO patterns and such given object store range coalescing, pages, etc.

Copy link
Contributor Author

@Dandandan Dandandan Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah - I also think this is quite a bit better - I think it's OK if a filter "only" selects 50% of the rows, if it is nicely packed that will probably be able to skip almost 50% of the pages and lead to efficient IO and decoding.

But (especially when combining multiple filters), chances are the filter gets too fragmented: a 50% selective but densely packed filter is better than a 25% selective one that just alternates select 1 / skip 1 / select 1, which will do the same amount of IO while having horrible decoding performance.

I think you could even go further and check which combination of filters gives the best of both worlds (selectivity and fragmentation)

/// Sets the maximum allowed **selector density** (`selector_count /
/// row_count`) before a predicate's result is deferred.
///
/// If applying a predicate would produce a selector density above this
/// value, its result is deferred instead of being applied immediately.
/// For example, `0.25` allows at most 25 selectors per 100 rows.
///
/// A high selector density means many small skip/read transitions, which
/// slows subsequent predicate evaluation and data decoding. Deferring
/// scattering predicates keeps the selection contiguous for the
/// intermediate steps.
///
/// Deferred results are still applied at the end via
/// [`RowSelection::intersection`], so correctness is preserved.
///
/// Passing `None` disables deferral (the default).
///
/// [`RowFilter`]: crate::arrow::arrow_reader::RowFilter
/// [`RowSelection::intersection`]: crate::arrow::arrow_reader::RowSelection::intersection
pub fn with_scatter_threshold(mut self, threshold: Option<f64>) -> Self {
    self.scatter_threshold = threshold;
    self
}
}

/// Options that control how [`ParquetMetaData`] is read when constructing
Expand Down Expand Up @@ -1188,6 +1218,7 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
metrics,
// Not used for the sync reader, see https://github.com/apache/arrow-rs/issues/8000
max_predicate_cache_size: _,
scatter_threshold,
} = self;

// Try to avoid allocate large buffer
Expand All @@ -1203,7 +1234,8 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {

let mut plan_builder = ReadPlanBuilder::new(batch_size)
.with_selection(selection)
.with_row_selection_policy(row_selection_policy);
.with_row_selection_policy(row_selection_policy)
.with_scatter_threshold(scatter_threshold);

// Update selection based on any filters
if let Some(filter) = filter.as_mut() {
Expand All @@ -1217,7 +1249,13 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
.with_parquet_metadata(&reader.metadata)
.build_array_reader(fields.as_deref(), predicate.projection())?;

plan_builder = plan_builder.with_predicate(array_reader, predicate.as_mut())?;
let row_count: usize = reader
.row_groups
.iter()
.map(|&i| reader.metadata.row_group(i).num_rows() as usize)
.sum();
plan_builder =
plan_builder.with_predicate(array_reader, predicate.as_mut(), row_count)?;
}
}

Expand Down
Loading
Loading