Merged
133 changes: 130 additions & 3 deletions datafusion/core/src/physical_plan/filter.rs
@@ -38,7 +38,7 @@ use arrow::error::Result as ArrowResult;
use arrow::record_batch::RecordBatch;
use datafusion_expr::Operator;
use datafusion_physical_expr::expressions::BinaryExpr;
use datafusion_physical_expr::split_conjunction;
use datafusion_physical_expr::{split_conjunction, AnalysisContext};

use log::debug;

@@ -168,9 +168,27 @@ impl ExecutionPlan for FilterExec {
Some(self.metrics.clone_inner())
}

/// The output statistics of a filtering operation are unknown
/// The output statistics of a filtering operation can be estimated if the
/// predicate's selectivity value can be determined for the incoming data.
fn statistics(&self) -> Statistics {
Statistics::default()
let input_stats = self.input.statistics();
let analysis_ctx =
AnalysisContext::from_statistics(self.input.schema().as_ref(), &input_stats);

let predicate_selectivity = self
.predicate
.boundaries(&analysis_ctx)
.and_then(|bounds| bounds.selectivity);

match predicate_selectivity {
Some(selectivity) => Statistics {
num_rows: input_stats
.num_rows
.map(|num_rows| (num_rows as f64 * selectivity).ceil() as usize),
..Default::default()
Contributor:
I wonder if we should explicitly list out is_exact: false here? Default::default() gets the same result but maybe being explicit would be better 🤔

},
None => Statistics::default(),
}
}
}
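The estimate in `statistics()` above scales the input row count by the predicate's selectivity and falls back to unknown statistics when no selectivity is available. A minimal standalone sketch of that logic; `Statistics` and `estimate_rows` here are illustrative stand-ins with only the fields this hunk touches, not DataFusion's actual types:

```rust
// Illustrative stand-in for the relevant fields of DataFusion's Statistics.
#[derive(Debug, Default, PartialEq)]
struct Statistics {
    num_rows: Option<usize>,
    is_exact: bool,
}

/// Scale the input row count by the predicate's selectivity, rounding up.
/// Returns default (all-unknown) statistics when selectivity is unavailable.
fn estimate_rows(input: &Statistics, selectivity: Option<f64>) -> Statistics {
    match selectivity {
        Some(s) => Statistics {
            num_rows: input.num_rows.map(|n| (n as f64 * s).ceil() as usize),
            is_exact: false, // a scaled count is an estimate, never exact
        },
        None => Statistics::default(),
    }
}

fn main() {
    let input = Statistics { num_rows: Some(100), is_exact: true };
    // e.g. a <= 25 over a in [1, 100] yields selectivity 0.25
    let out = estimate_rows(&input, Some(0.25));
    println!("{:?}", out.num_rows); // Some(25)
}
```

Note the `ceil()`: rounding up keeps the estimate at one row or more whenever the scaled count is nonzero, matching the hunk above.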

@@ -282,9 +300,14 @@ mod tests {
use crate::physical_plan::{collect, with_new_children_if_necessary};
use crate::prelude::SessionContext;
use crate::test;
use crate::test::exec::StatisticsExec;
use crate::test_util;
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::ColumnStatistics;
use datafusion_common::ScalarValue;
use datafusion_expr::Operator;
use std::iter::Iterator;
use std::sync::Arc;

#[tokio::test]
async fn simple_predicate() -> Result<()> {
@@ -380,4 +403,108 @@

Ok(())
}

#[tokio::test]
async fn test_filter_statistics_basic_expr() -> Result<()> {
// Table:
// a: min=1, max=100
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
let input = Arc::new(StatisticsExec::new(
Statistics {
num_rows: Some(100),
column_statistics: Some(vec![ColumnStatistics {
min_value: Some(ScalarValue::Int32(Some(1))),
max_value: Some(ScalarValue::Int32(Some(100))),
..Default::default()
}]),
..Default::default()
},
schema.clone(),
));

// a <= 25
let predicate: Arc<dyn PhysicalExpr> =
binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?;

// WHERE a <= 25
let filter: Arc<dyn ExecutionPlan> =
Arc::new(FilterExec::try_new(predicate, input)?);

let statistics = filter.statistics();
assert_eq!(statistics.num_rows, Some(25));
Contributor:
👨‍🍳 👌

Very nice


Ok(())
}

#[tokio::test]
#[ignore]
// This test requires propagation of column boundaries from the comparison analysis
// to the analysis context. This is not yet implemented.
async fn test_filter_statistics_column_level_basic_expr() -> Result<()> {
Contributor (author):
@alamb while working on this, I've noticed the initial application of propagation of new column limits. Since we don't have an API to represent changes to the boundaries during an expression's analysis (like a becomes [1, 25] in the example below) we can't generate the column_statistics which is essentially rendering nested join optimizations unusable (and potentially any other analysis that needs column level stats).

This doesn't mean it is completely ineffective as is, since we can at least find the cardinality of the filter itself and do the local filter <-> table switch in the case below. But I think it might make sense to at least investigate potential ways to deal with this.

[image: plan figure with a join]

Contributor (author) @isidentical, Nov 11, 2022:
I have a simple solution for this problem (isidentical#5) that essentially implements a much more narrow-scoped version of the apply() API from the previous iteration. It doesn't add any new methods to the physical expressions, but it still shares a mutable context reference (I kind of liken this to other similar APIs in datafusion like expr_to_columns), so I'm not sure if the same reservations still apply. I'd be really interested in your feedback on this.

Contributor:
I don't understand what about this test requires column level analysis -- your figure has a join in it, but this test just seems to be the same as test_filter_statistics_basic_expr above it. I will look at isidentical#5 shortly

// Table:
// a: min=1, max=100
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
let input = Arc::new(StatisticsExec::new(
Statistics {
num_rows: Some(100),
column_statistics: Some(vec![ColumnStatistics {
min_value: Some(ScalarValue::Int32(Some(1))),
max_value: Some(ScalarValue::Int32(Some(100))),
..Default::default()
}]),
..Default::default()
},
schema.clone(),
));

// a <= 25
let predicate: Arc<dyn PhysicalExpr> =
binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?;

// WHERE a <= 25
let filter: Arc<dyn ExecutionPlan> =
Arc::new(FilterExec::try_new(predicate, input)?);

let statistics = filter.statistics();
assert_eq!(statistics.num_rows, Some(25));
assert_eq!(
statistics.column_statistics,
Some(vec![ColumnStatistics {
min_value: Some(ScalarValue::Int32(Some(1))),
max_value: Some(ScalarValue::Int32(Some(25))),
..Default::default()
}])
);

Ok(())
}
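The boundary propagation the thread above discusses (a becoming [1, 25] under `a <= 25`) can be sketched independently of DataFusion. This is a minimal model under a uniform-distribution assumption; `Interval` and `apply_lt_eq` are hypothetical names, not DataFusion's API:

```rust
// Hypothetical model of clamping a column's [min, max] with `col <= value`.
#[derive(Debug, Clone, Copy, PartialEq)]
struct Interval {
    min: i64,
    max: i64,
}

/// Apply `col <= value`: return the narrowed interval (None if empty) and
/// the fraction of rows expected to match, assuming values are uniformly
/// distributed over the inclusive interval.
fn apply_lt_eq(col: Interval, value: i64) -> (Option<Interval>, f64) {
    if value < col.min {
        (None, 0.0) // no row can satisfy the predicate
    } else if value >= col.max {
        (Some(col), 1.0) // every row matches; bounds are unchanged
    } else {
        let narrowed = Interval { min: col.min, max: value };
        let width = (col.max - col.min + 1) as f64;
        let kept = (value - col.min + 1) as f64;
        (Some(narrowed), kept / width)
    }
}

fn main() {
    // a in [1, 100], predicate a <= 25: bounds become [1, 25], selectivity 0.25
    let (bounds, sel) = apply_lt_eq(Interval { min: 1, max: 100 }, 25);
    println!("{:?} {}", bounds, sel);
}
```

The narrowed interval is exactly the `column_statistics` the ignored test expects; the missing piece the thread describes is an API for writing it back into the analysis context.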

#[tokio::test]
async fn test_filter_statistics_when_input_stats_missing() -> Result<()> {
// Table:
// a: min=???, max=??? (missing)
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
let input = Arc::new(StatisticsExec::new(
Statistics {
column_statistics: Some(vec![ColumnStatistics {
..Default::default()
}]),
..Default::default()
},
schema.clone(),
));

// a <= 25
let predicate: Arc<dyn PhysicalExpr> =
binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?;

// WHERE a <= 25
let filter: Arc<dyn ExecutionPlan> =
Arc::new(FilterExec::try_new(predicate, input)?);

let statistics = filter.statistics();
assert_eq!(statistics.num_rows, None);

Ok(())
}
}
5 changes: 3 additions & 2 deletions datafusion/core/tests/statistics.rs
@@ -238,8 +238,9 @@ async fn sql_filter() -> Result<()> {
.await
.unwrap();

// with a filtering condition we loose all knowledge about the statistics
assert_eq!(Statistics::default(), physical_plan.statistics());
let stats = physical_plan.statistics();
assert!(!stats.is_exact);
assert_eq!(stats.num_rows, Some(1));
Contributor:
Nice


Ok(())
}
8 changes: 4 additions & 4 deletions datafusion/physical-expr/src/expressions/binary.rs
@@ -2970,7 +2970,7 @@ mod tests {
];

for ((operator, rhs), (exp_selectivity, _, _)) in cases {
let context = AnalysisContext::from_statistics(&schema, statistics.clone());
let context = AnalysisContext::from_statistics(&schema, &statistics);
let left = col("a", &schema).unwrap();
let right = ScalarValue::Int64(Some(rhs));
let boundaries =
@@ -3039,7 +3039,7 @@
];

for ((operator, rhs), (exp_selectivity, _, _)) in cases {
let context = AnalysisContext::from_statistics(&schema, statistics.clone());
let context = AnalysisContext::from_statistics(&schema, &statistics);
let left = col("a", &schema).unwrap();
let right = ScalarValue::from(rhs);
let boundaries =
@@ -3085,7 +3085,7 @@
&schema,
);

let context = AnalysisContext::from_statistics(&schema, statistics);
let context = AnalysisContext::from_statistics(&schema, &statistics);
let predicate_boundaries = gt
.boundaries(&context)
.expect("boundaries should not be None");
@@ -3113,7 +3113,7 @@
&schema,
);

let context = AnalysisContext::from_statistics(&schema, statistics);
let context = AnalysisContext::from_statistics(&schema, &statistics);
let predicate_boundaries = gt
.boundaries(&context)
.expect("boundaries should not be None");
2 changes: 1 addition & 1 deletion datafusion/physical-expr/src/expressions/column.rs
@@ -290,7 +290,7 @@ mod test {
#[test]
fn stats_bounds_analysis() -> Result<()> {
let (schema, statistics) = get_test_table_stats();
let context = AnalysisContext::from_statistics(&schema, statistics);
let context = AnalysisContext::from_statistics(&schema, &statistics);

let cases = [
// (name, index, expected boundaries)
4 changes: 2 additions & 2 deletions datafusion/physical-expr/src/physical_expr.rs
@@ -103,11 +103,11 @@ impl AnalysisContext {
}

/// Create a new analysis context from column statistics.
pub fn from_statistics(input_schema: &Schema, statistics: Statistics) -> Self {
pub fn from_statistics(input_schema: &Schema, statistics: &Statistics) -> Self {
// Even if the underlying statistics object doesn't have any column level statistics,
// we can still create an analysis context with the same number of columns and see whether
// we can infer it during the way.
let column_boundaries = match statistics.column_statistics {
let column_boundaries = match &statistics.column_statistics {
Some(columns) => columns
.iter()
.map(ExprBoundaries::from_column)