apache · mshtelma · Mar 26, 2018 · Apr 3, 2018 · Apr 12, 2018 · Apr 13, 2018
diff --git a/.../scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/.../scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -392,6 +392,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
     val dataType = attr.dataType
     var newNdv = ndv
 
+    if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty)  {
+      return Some(0.0)
+    }
+
     // use [min, max] to filter the original hSet
     dataType match {
       case _: NumericType | BooleanType | DateType | TimestampType =>

diff --git a/.../src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/.../src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -357,6 +357,17 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
       expectedRowCount = 3)
   }
 
+  test("evaluateInSet with all zeros") {
+    validateEstimatedStats(
+      Filter(InSet(attrString, Set(3, 4, 5)),
+        StatsTestPlan(Seq(attrString), 0,
+          AttributeMap(Seq(attrString ->
+            ColumnStat(distinctCount = Some(0), min = None, max = None,
+              nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))),
+      Seq(attrString -> ColumnStat(distinctCount = Some(0))),
+      expectedRowCount = 0)
+  }
+
   test("cint NOT IN (3, 4, 5)") {
     validateEstimatedStats(
       Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -382,4 +382,32 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
       }
     }
   }
+
+  test("Simple queries must be working, if CBO is turned on") {
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      withTable("TBL1", "TBL") {
+        import org.apache.spark.sql.functions._
+        val df = spark.range(1000L).select('id,
+          'id * 2 as "FLD1",
+          'id * 12 as "FLD2",
+          lit("aaa") + 'id as "fld3")
+        df.write
+          .mode(SaveMode.Overwrite)
+          .bucketBy(10, "id", "FLD1", "FLD2")
+          .sortBy("id", "FLD1", "FLD2")
+          .saveAsTable("TBL")
+        sql("ANALYZE TABLE TBL COMPUTE STATISTICS ")
+        sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3")
+        val df2 = spark.sql(
+          """
+             |SELECT t1.id, t1.fld1, t1.fld2, t1.fld3
+             |FROM tbl t1
+             |JOIN tbl t2 on t1.id=t2.id
+             |WHERE  t1.fld3 IN (-123.23,321.23)
+          """.stripMargin)
+        df2.createTempView("TBL2")
+        sql("SELECT * FROM tbl2 WHERE fld3 IN ('qqq', 'qwe')  ").queryExecution.executedPlan
+      }
+    }
+  }
 }