-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Prune columns are all null in ParquetExec by row_counts , handle IS NOT NULL #9989
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
de93d2a
18c9f4d
4c908f0
11567d9
93f9bea
5ecdc5c
0d91757
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -338,8 +338,10 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { | |
| scalar.to_array().ok() | ||
| } | ||
|
|
||
| fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { | ||
| None | ||
| fn row_counts(&self, column: &Column) -> Option<ArrayRef> { | ||
| let (c, _) = self.column(&column.name)?; | ||
| let scalar = ScalarValue::UInt64(Some(c.num_values() as u64)); | ||
| scalar.to_array().ok() | ||
| } | ||
|
|
||
| fn contained( | ||
|
|
@@ -1022,15 +1024,17 @@ mod tests { | |
| column_statistics: Vec<ParquetStatistics>, | ||
| ) -> RowGroupMetaData { | ||
| let mut columns = vec![]; | ||
| let number_row = 1000; | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Before all unit test set each col with default
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was wondering if this could cause problems in real files (for example, if the row counts were not included in the statistics that were written into the file). However, I double checked the code and it seems like |
||
| for (i, s) in column_statistics.iter().enumerate() { | ||
| let column = ColumnChunkMetaData::builder(schema_descr.column(i)) | ||
| .set_statistics(s.clone()) | ||
| .set_num_values(number_row) | ||
| .build() | ||
| .unwrap(); | ||
| columns.push(column); | ||
| } | ||
| RowGroupMetaData::builder(schema_descr.clone()) | ||
| .set_num_rows(1000) | ||
| .set_num_rows(number_row) | ||
| .set_total_byte_size(2000) | ||
| .set_column_metadata(columns) | ||
| .build() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1262,3 +1262,63 @@ async fn prune_periods_in_column_names() { | |
| .test_row_group_prune() | ||
| .await; | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_row_group_with_null_values() { | ||
| // Three row groups: | ||
| // 1. all Null values | ||
| // 2. values from 1 to 5 | ||
| // 3. all Null values | ||
|
|
||
| // After pruning, only row group 2 should be selected | ||
| RowGroupPruningTest::new() | ||
| .with_scenario(Scenario::WithNullValues) | ||
| .with_query("SELECT * FROM t WHERE \"i8\" <= 5") | ||
| .with_expected_errors(Some(0)) | ||
| .with_matched_by_stats(Some(1)) | ||
| .with_pruned_by_stats(Some(2)) | ||
| .with_expected_rows(5) | ||
| .with_matched_by_bloom_filter(Some(0)) | ||
| .with_pruned_by_bloom_filter(Some(0)) | ||
| .test_row_group_prune() | ||
| .await; | ||
|
|
||
| // After pruning, only row group 1,3 should be selected | ||
| RowGroupPruningTest::new() | ||
| .with_scenario(Scenario::WithNullValues) | ||
| .with_query("SELECT * FROM t WHERE \"i8\" is Null") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also add a tests:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| .with_expected_errors(Some(0)) | ||
| .with_matched_by_stats(Some(2)) | ||
| .with_pruned_by_stats(Some(1)) | ||
| .with_expected_rows(10) | ||
| .with_matched_by_bloom_filter(Some(0)) | ||
| .with_pruned_by_bloom_filter(Some(0)) | ||
| .test_row_group_prune() | ||
| .await; | ||
|
|
||
| // After pruning, only row group 2should be selected | ||
| RowGroupPruningTest::new() | ||
| .with_scenario(Scenario::WithNullValues) | ||
| .with_query("SELECT * FROM t WHERE \"i16\" is Not Null") | ||
| .with_expected_errors(Some(0)) | ||
| .with_matched_by_stats(Some(1)) | ||
| .with_pruned_by_stats(Some(2)) | ||
| .with_expected_rows(5) | ||
| .with_matched_by_bloom_filter(Some(0)) | ||
| .with_pruned_by_bloom_filter(Some(0)) | ||
| .test_row_group_prune() | ||
| .await; | ||
|
|
||
| // All row groups will be pruned | ||
| RowGroupPruningTest::new() | ||
| .with_scenario(Scenario::WithNullValues) | ||
| .with_query("SELECT * FROM t WHERE \"i32\" > 7") | ||
| .with_expected_errors(Some(0)) | ||
| .with_matched_by_stats(Some(0)) | ||
| .with_pruned_by_stats(Some(3)) | ||
| .with_expected_rows(0) | ||
| .with_matched_by_bloom_filter(Some(0)) | ||
| .with_pruned_by_bloom_filter(Some(0)) | ||
| .test_row_group_prune() | ||
| .await; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another implementor returns an
Int64array. I'm not sure if the type matters?https://github.com/apache/arrow-datafusion/blob/215f30f74a12e91fd7dca0d30e37014c8c3493ed/datafusion/core/src/physical_optimizer/pruning.rs#L1699-L1709
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Try to keep with https://github.com/apache/arrow-datafusion/blob/18c9f4d7e68b3eb470b2b2ef3f2297e018dd4298/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs#L340-L342 avoid cast during
=, i think Int64Array only use in testThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UInt64also matches the documentation https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/trait.PruningStatistics.html#tymethod.row_counts 👍I made a PR to fix the tests: #10004