Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-23799][SQL] FilterEstimation.evaluateInSet produces devision by zero in a case of empty table with analyzed statistics #21052

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
val dataType = attr.dataType
var newNdv = ndv

if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why colStat.min.isEmpty || colStat.max.isEmpty means empty output? string type always has no max/min

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we need to correct it in the next PR

return Some(0.0)
}

// use [min, max] to filter the original hSet
dataType match {
case _: NumericType | BooleanType | DateType | TimestampType =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,17 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 3)
}

test("evaluateInSet with all zeros") {
validateEstimatedStats(
Filter(InSet(attrString, Set(3, 4, 5)),
StatsTestPlan(Seq(attrString), 0,
AttributeMap(Seq(attrString ->
ColumnStat(distinctCount = Some(0), min = None, max = None,
nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))),
Seq(attrString -> ColumnStat(distinctCount = Some(0))),
expectedRowCount = 0)
}

test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,4 +382,32 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
}
}
}

test("Simple queries must be working, if CBO is turned on") {
withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
withTable("TBL1", "TBL") {
import org.apache.spark.sql.functions._
val df = spark.range(1000L).select('id,
'id * 2 as "FLD1",
'id * 12 as "FLD2",
lit("aaa") + 'id as "fld3")
df.write
.mode(SaveMode.Overwrite)
.bucketBy(10, "id", "FLD1", "FLD2")
.sortBy("id", "FLD1", "FLD2")
.saveAsTable("TBL")
sql("ANALYZE TABLE TBL COMPUTE STATISTICS ")
sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3")
val df2 = spark.sql(
"""
|SELECT t1.id, t1.fld1, t1.fld2, t1.fld3
|FROM tbl t1
|JOIN tbl t2 on t1.id=t2.id
|WHERE t1.fld3 IN (-123.23,321.23)
""".stripMargin)
df2.createTempView("TBL2")
sql("SELECT * FROM tbl2 WHERE fld3 IN ('qqq', 'qwe') ").queryExecution.executedPlan
}
}
}
}