[SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown
## What changes were proposed in this pull request?
We get an NPE when we have a filter on a partition column of the form `col in (x, null)`. This is due to the filter converter in HiveShim not handling `null`s correctly. This patch fixes the bug while still pushing down as many of the partition pruning predicates as possible, by filtering out `null`s from any `in` predicate. Since Hive only supports very simple partition pruning filters, this change should preserve correctness.
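
For illustration, here is a minimal self-contained sketch of the idea (plain Scala with hypothetical `Lit` / `toMetastoreInFilter` names, not the actual Spark internals; the real change is in the `HiveShim` diff below): null literals are dropped from the `IN` list before the metastore filter string is built, so `intcol in (1, null)` is pushed down as `(intcol = 1)`, and a list containing only nulls produces no pushdown at all.

```scala
// Hypothetical stand-ins for Catalyst expressions, just to show the idea.
sealed trait Expr
case class Lit(value: Any) extends Expr

// Build a metastore-style IN filter, dropping null literals first.
// Hive metastore filters are NULL-intolerant and joined only by AND/OR,
// so treating NULL as false here preserves correctness; Catalyst still
// evaluates the full predicate after partition pruning.
def toMetastoreInFilter(col: String, values: Seq[Expr]): Option[String] = {
  val nonNull = values.collect { case Lit(v) if v != null => v.toString }
  if (nonNull.isEmpty) None // nothing pushable, e.g. `intcol in (null)`
  else Some(nonNull.map(v => s"$col = $v").mkString("(", " or ", ")"))
}

toMetastoreInFilter("intcol", Seq(Lit(1), Lit(null)))  // Some("(intcol = 1)")
toMetastoreInFilter("intcol", Seq(Lit(null)))          // None
```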

## How was this patch tested?
Unit tests, manual tests

Author: William Sheu <william.sheu@databricks.com>

Closes #21832 from PenguinToast/partition-pruning-npe.

(cherry picked from commit bbd6f0c)
Signed-off-by: Xiao Li <gatorsmile@gmail.com>
PenguinToast authored and gatorsmile committed Jul 21, 2018
1 parent db1f3cc commit bd6bfac
Showing 2 changed files with 32 additions and 1 deletion.
@@ -599,6 +599,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiteral {
   def unapply(expr: Expression): Option[String] = expr match {
+    case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs.
     case Literal(value, _: IntegralType) => Some(value.toString)
     case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString))
     case _ => None
@@ -607,7 +608,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiterals {
   def unapply(exprs: Seq[Expression]): Option[Seq[String]] = {
-    val extractables = exprs.map(ExtractableLiteral.unapply)
+    // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want
+    // to push down as many predicates as we can while still maintaining correctness.
+    // In SQL, the `IN` expression evaluates as follows:
+    // > `1 in (2, NULL)` -> NULL
+    // > `1 in (1, NULL)` -> true
+    // > `1 in (2)` -> false
+    // Since Hive metastore filters are NULL-intolerant binary operations joined only by
+    // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as
+    // `1 in (2)`.
+    // If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts
+    // pushing down these predicates, then this optimization will become incorrect and need
+    // to be changed.
+    val extractables = exprs
+      .filter {
+        case Literal(null, _) => false
+        case _ => true
+      }.map(ExtractableLiteral.unapply)
     if (extractables.nonEmpty && extractables.forall(_.isDefined)) {
       Some(extractables.map(_.get))
     } else {
@@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
     (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil,
     """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""")
 
+  filterTest("SPARK-24879 null literals should be ignored for IN constructs",
+    (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil,
+    "(intcol = 1)")
+
+  // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization
+  // will be applied by Catalyst, this filter converter does not need to account for this.
+  filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE",
+    (a("intcol", IntegerType) in Literal(null)) :: Nil,
+    "")
+
+  filterTest("typecast null literals should not be pushed down in simple predicates",
+    (a("intcol", IntegerType) === Literal(null, IntegerType)) :: Nil,
+    "")
+
   private def filterTest(name: String, filters: Seq[Expression], result: String) = {
     test(name) {
       withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") {
