[SPARK-31553][SQL] Fix isInCollection for collection sizes above the optimisation threshold #28328

Closed · wants to merge 9 commits
@@ -426,10 +426,22 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
* Optimized version of In clause, when all filter values of In clause are
* static.
*/
case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with Predicate {
case class InSet(
child: Expression,
hset: Set[Any],
hsetElemType: DataType) extends UnaryExpression with Predicate {
Member Author

Matching internal Catalyst types to external types is ambiguous. For example:
Long -> Long
Long -> Timestamp

Also, the type of child can be unknown at the point where InSet needs to know the Catalyst type of the hset elements.

hsetElemType is needed to eliminate the ambiguity.
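
For instance, a minimal sketch of the ambiguity using CatalystTypeConverters.convertToScala (the value is made up, and the Timestamp result assumes the default external type mapping):

import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
import org.apache.spark.sql.types.{LongType, TimestampType}

// The same Catalyst value (a Long holding microseconds) maps to two different
// external types, so the reverse mapping cannot be derived from the value alone.
val catalystValue: Any = 1000000L
convertToScala(catalystValue, LongType)      // 1000000L (a Scala Long)
convertToScala(catalystValue, TimestampType) // 1970-01-01 00:00:01.0 (a java.sql.Timestamp)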

Member

Do you think we can make this an Option[DataType], because only a few things are ambiguous?

Member Author

We can, but if a caller passes None, InSet will not be able to infer the element type when child.dataType is NullType, as in this case: dataType returns NullType if child is a PrettyAttribute.

Contributor

When can hsetElemType be different from child.dataType?

Member Author

When InSet is created from isInCollection; in that case child.dataType is NullType. For example, it is NullType in the test https://github.com/apache/spark/pull/28328/files#diff-aa655ba249e00d2591b21cf6a360cf82R886 because child is a PrettyAttribute when the sql method is called.

Member Author

And InSet.sql() is called from Dataset.select via _.named:

Project(untypedCols.map(_.named), logicalPlan)

The named method calls toPrettySQL(expr):

case expr: Expression => Alias(expr, toPrettySQL(expr))()

The toPrettySQL method calls sql:

def toPrettySQL(e: Expression): String = usePrettyExpression(e).sql
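
A minimal sketch of that situation (PrettyAttribute and InSet are Catalyst internals, so this assumes code that can see org.apache.spark.sql.catalyst.expressions):

import org.apache.spark.sql.catalyst.expressions.{InSet, PrettyAttribute}
import org.apache.spark.sql.types.{NullType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// Pretty-printed attributes default to NullType, so child.dataType tells InSet nothing.
val child = PrettyAttribute("a", NullType)
val hset: Set[Any] = Set("a", "b").map(UTF8String.fromString)

// Before this change, InSet.sql converted the UTF8String elements back to external
// values via child.dataType (NullType) and produced a broken SQL string; with the
// explicit element type the conversion is unambiguous.
println(InSet(child, hset, StringType).sql)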


require(hset != null, "hset could not be null")

override def checkInputDataTypes(): TypeCheckResult = {
if (!DataType.equalsStructurally(child.dataType, hsetElemType, ignoreNullability = true)) {
TypeCheckResult.TypeCheckFailure(s"Arguments must be same type but were: " +
s"${child.dataType.catalogString} != ${hsetElemType.catalogString}")
} else {
TypeUtils.checkForOrderingExpr(child.dataType, s"function $prettyName")
}
}

override def toString: String = s"$child INSET ${hset.mkString("(", ",", ")")}"

@transient private[this] lazy val hasNull: Boolean = hset.contains(null)
@@ -446,12 +458,12 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
}
}

@transient lazy val set: Set[Any] = child.dataType match {
@transient lazy val set: Set[Any] = hsetElemType match {
case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
case _: NullType => hset
case _ =>
// for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ (hset - null)
TreeSet.empty(TypeUtils.getInterpretedOrdering(hsetElemType)) ++ (hset - null)
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
@@ -462,7 +474,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
}
}

private def canBeComputedUsingSwitch: Boolean = child.dataType match {
private def canBeComputedUsingSwitch: Boolean = hsetElemType match {
case ByteType | ShortType | IntegerType | DateType => true
case _ => false
}
@@ -521,7 +533,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
override def sql: String = {
val valueSQL = child.sql
val listSQL = hset.toSeq
.map(elem => Literal(convertToScala(elem, child.dataType)).sql)
.map(elem => Literal(convertToScala(elem, hsetElemType)).sql)
.mkString(", ")
s"($valueSQL IN ($listSQL))"
}
@@ -251,7 +251,7 @@ object OptimizeIn extends Rule[LogicalPlan] {
EqualTo(v, newList.head)
} else if (newList.length > SQLConf.get.optimizerInSetConversionThreshold) {
val hSet = newList.map(e => e.eval(EmptyRow))
InSet(v, HashSet() ++ hSet)
InSet(v, HashSet() ++ hSet, v.dataType)
} else if (newList.length < list.length) {
expr.copy(list = newList)
} else { // newList.length == list.length && newList.length > 1
@@ -172,7 +172,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
val hSet = expList.map(e => e.eval())
evaluateInSet(ar, HashSet() ++ hSet, update)

case InSet(ar: Attribute, set) =>
case InSet(ar: Attribute, set, _) =>
evaluateInSet(ar, set, update)

// In current stage, we don't have advanced statistics such as sketches or histograms.
@@ -130,7 +130,9 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
private def checkInAndInSet(in: In, expected: Any): Unit = {
// expecting all in.list are Literal or NonFoldableLiteral.
checkEvaluation(in, expected)
checkEvaluation(InSet(in.value, HashSet() ++ in.list.map(_.eval())), expected)
checkEvaluation(
InSet(in.value, HashSet() ++ in.list.map(_.eval()), in.value.dataType),
expected)
}

test("basic IN/INSET predicate test") {
@@ -154,7 +156,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
Literal(2)))),
true)
checkEvaluation(
And(InSet(Literal(1), HashSet(1, 2)), InSet(Literal(2), Set(1, 2))),
And(InSet(Literal(1), HashSet(1, 2), IntegerType), InSet(Literal(2), Set(1, 2), IntegerType)),
true)

val ns = NonFoldableLiteral.create(null, StringType)
@@ -256,12 +258,12 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {

val nullLiteral = Literal(null, presentValue.dataType)

checkEvaluation(InSet(nullLiteral, values), expected = null)
checkEvaluation(InSet(nullLiteral, values + null), expected = null)
checkEvaluation(InSet(presentValue, values), expected = true)
checkEvaluation(InSet(presentValue, values + null), expected = true)
checkEvaluation(InSet(absentValue, values), expected = false)
checkEvaluation(InSet(absentValue, values + null), expected = null)
checkEvaluation(InSet(nullLiteral, values, nullLiteral.dataType), expected = null)
checkEvaluation(InSet(nullLiteral, values + null, nullLiteral.dataType), expected = null)
checkEvaluation(InSet(presentValue, values, presentValue.dataType), expected = true)
checkEvaluation(InSet(presentValue, values + null, presentValue.dataType), expected = true)
checkEvaluation(InSet(absentValue, values, absentValue.dataType), expected = false)
checkEvaluation(InSet(absentValue, values + null, absentValue.dataType), expected = null)
}

def checkAllTypes(): Unit = {
@@ -498,7 +500,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {

test("SPARK-22693: InSet should not use global variables") {
val ctx = new CodegenContext
InSet(Literal(1), Set(1, 2, 3, 4)).genCode(ctx)
InSet(Literal(1), Set(1, 2, 3, 4), IntegerType).genCode(ctx)
assert(ctx.inlinedMutableStates.isEmpty)
}

@@ -535,7 +537,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {

test("SPARK-29100: InSet with empty input set") {
val row = create_row(1)
val inSet = InSet(BoundReference(0, IntegerType, true), Set.empty)
val inSet = InSet(BoundReference(0, IntegerType, true), Set.empty, IntegerType)
checkEvaluation(inSet, false, row)
}
}
@@ -85,7 +85,7 @@ class OptimizeInSuite extends PlanTest {
val optimized = Optimize.execute(originalQuery.analyze)
val correctAnswer =
testRelation
.where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet))
.where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet, IntegerType))
.analyze

comparePlans(optimized, correctAnswer)
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.{ColumnStatsM
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/**
* In this test suite, we test predicates containing the following operators:
@@ -352,15 +353,15 @@ class FilterEstimationSuite extends StatsEstimationTestBase {

test("cint IN (3, 4, 5)") {
validateEstimatedStats(
Filter(InSet(attrInt, Set(3, 4, 5)), childStatsTestPlan(Seq(attrInt), 10L)),
Filter(InSet(attrInt, Set(3, 4, 5), IntegerType), childStatsTestPlan(Seq(attrInt), 10L)),
Seq(attrInt -> ColumnStat(distinctCount = Some(3), min = Some(3), max = Some(5),
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))),
expectedRowCount = 3)
}

test("evaluateInSet with all zeros") {
validateEstimatedStats(
Filter(InSet(attrString, Set(3, 4, 5)),
Filter(InSet(attrString, Set(3, 4, 5), IntegerType),
StatsTestPlan(Seq(attrString), 0,
AttributeMap(Seq(attrString ->
ColumnStat(distinctCount = Some(0), min = None, max = None,
@@ -371,7 +372,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase {

test("evaluateInSet with string") {
validateEstimatedStats(
Filter(InSet(attrString, Set("A0")),
Filter(InSet(attrString, Set(UTF8String.fromString("A0")), StringType),
StatsTestPlan(Seq(attrString), 10,
AttributeMap(Seq(attrString ->
ColumnStat(distinctCount = Some(10), min = None, max = None,
@@ -383,14 +384,14 @@ class FilterEstimationSuite extends StatsEstimationTestBase {

test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
Filter(Not(InSet(attrInt, Set(3, 4, 5), IntegerType)), childStatsTestPlan(Seq(attrInt), 10L)),
Seq(attrInt -> colStatInt.copy(distinctCount = Some(7))),
expectedRowCount = 7)
}

test("cbool IN (true)") {
validateEstimatedStats(
Filter(InSet(attrBool, Set(true)), childStatsTestPlan(Seq(attrBool), 10L)),
Filter(InSet(attrBool, Set(true), BooleanType), childStatsTestPlan(Seq(attrBool), 10L)),
Seq(attrBool -> ColumnStat(distinctCount = Some(1), min = Some(true), max = Some(true),
nullCount = Some(0), avgLen = Some(1), maxLen = Some(1))),
expectedRowCount = 5)
@@ -510,7 +511,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
attributeStats = AttributeMap(Seq(attrInt -> cornerChildColStatInt))
)
validateEstimatedStats(
Filter(InSet(attrInt, Set(1, 2, 3, 4, 5)), cornerChildStatsTestplan),
Filter(InSet(attrInt, Set(1, 2, 3, 4, 5), IntegerType), cornerChildStatsTestplan),
Seq(attrInt -> ColumnStat(distinctCount = Some(2), min = Some(1), max = Some(5),
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))),
expectedRowCount = 2)
9 changes: 5 additions & 4 deletions sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -828,11 +828,12 @@
* @since 2.4.0
*/
def isInCollection(values: scala.collection.Iterable[_]): Column = withExpr {
val hSet = values.toSet[Any]
if (hSet.size > SQLConf.get.optimizerInSetConversionThreshold) {
InSet(expr, hSet)
val exprValues = values.toSeq.map(lit(_).expr)
if (exprValues.size > SQLConf.get.optimizerInSetConversionThreshold) {
val elemType = exprValues.headOption.map(_.dataType).getOrElse(NullType)
InSet(expr, exprValues.map(_.eval()).toSet, elemType)
Contributor

How can we make sure the expr has the same data type as exprValues? Do we have a type coercion rule for it?

Member Author

To make sure of that, InSet needs something similar to In.checkInputDataTypes():

override def checkInputDataTypes(): TypeCheckResult = {
  val mismatchOpt = list.find(l => !DataType.equalsStructurally(l.dataType, value.dataType,
    ignoreNullability = true))
  if (mismatchOpt.isDefined) {
    TypeCheckResult.TypeCheckFailure(s"Arguments must be same type but were: " +
      s"${value.dataType.catalogString} != ${mismatchOpt.get.dataType.catalogString}")
  } else {
    TypeUtils.checkForOrderingExpr(value.dataType, s"function $prettyName")
  }
}

I could add such a check in this PR if you don't mind.

Member Author

I added a similar check to InSet.

} else {
In(expr, values.toSeq.map(lit(_).expr))
In(expr, exprValues)
Member

So, this is caused by SPARK-29048 (Improve performance on Column.isInCollection() with a large size collection, #25754) and only affects 3.0.0, right?

Member Author

Correct

Member

Thanks for confirming, @MaxGekk.
cc @WeichenXu123 and @gatorsmile

}
}
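
An end-to-end sketch of the scenario this hunk fixes (the local master, app name, and values are illustrative, not part of the patch):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("isInCollection-repro").getOrCreate()
import spark.implicits._

// More values than spark.sql.optimizer.inSetConversionThreshold (10 by default),
// so isInCollection builds an InSet expression instead of an In expression.
val values = (0 until 200).map(_.toString)
val df = Seq("10").toDF("x")
df.select($"x".isInCollection(values)).show()  // should show true for "10" once the fix is in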

@@ -479,7 +479,7 @@ object DataSourceStrategy {
case expressions.LessThanOrEqual(Literal(v, t), PushableColumn(name)) =>
Some(sources.GreaterThanOrEqual(name, convertToScala(v, t)))

case expressions.InSet(e @ PushableColumn(name), set) =>
case expressions.InSet(e @ PushableColumn(name), set, _) =>
val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType)
Some(sources.In(name, set.toArray.map(toScala)))

@@ -89,7 +89,7 @@ object FileSourceStrategy extends Strategy with Logging {
case expressions.In(a: Attribute, list)
if list.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName =>
getBucketSetFromIterable(a, list.map(e => e.eval(EmptyRow)))
case expressions.InSet(a: Attribute, hset)
case expressions.InSet(a: Attribute, hset, _)
if hset.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName =>
getBucketSetFromIterable(a, hset.map(e => expressions.Literal(e).eval(EmptyRow)))
case expressions.IsNull(a: Attribute) if a.name == bucketColumnName =>
@@ -159,7 +159,7 @@ case class InSubqueryExec(

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
prepareResult()
InSet(child, result.toSet).doGenCode(ctx, ev)
InSet(child, result.toSet, child.dataType).doGenCode(ctx, ev)
}

override lazy val canonicalized: InSubqueryExec = {
@@ -483,6 +483,10 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
"due to data type mismatch: Arguments must be same type but were").foreach { s =>
assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT)))
}
val errMsg = intercept[AnalysisException] {
df.select($"a".isInCollection(Seq(0, 1).map(new java.sql.Timestamp(_)))).collect()
}.getMessage
assert(errMsg.contains("Arguments must be same type"))
}
}
}
@@ -872,7 +876,18 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
}

test("SPARK-31563: sql of InSet for UTF8String collection") {
val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString))
val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString), StringType)
assert(inSet.sql === "('a' IN ('a', 'b'))")
}

test("SPARK-31553: isInCollection for collection sizes above a threshold") {
Member

Thank you, @MaxGekk.

cc @aokolnychyi and @dbtsai

val threshold = 100
withSQLConf(SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> threshold.toString) {
val set = (0 until 2 * threshold).map(_.toString).toSet
val elem = "10"
val data = Seq(elem).toDF("x")
assert(set.contains(elem))
checkAnswer(data.select($"x".isInCollection(set)), Row(true))
}
}
}
@@ -110,7 +110,9 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession {
testTranslateFilter(LessThanOrEqual(1, attrInt),
Some(sources.GreaterThanOrEqual(intColName, 1)))

testTranslateFilter(InSet(attrInt, Set(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3))))
testTranslateFilter(
InSet(attrInt, Set(1, 2, 3), IntegerType),
Some(sources.In(intColName, Array(1, 2, 3))))

testTranslateFilter(In(attrInt, Seq(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3))))

@@ -37,6 +37,7 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.BitSet

@@ -188,8 +189,10 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
df)

// Case 4: InSet
val inSetExpr = expressions.InSet($"j".expr,
Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr))
val inSetExpr = expressions.InSet(
$"j".expr,
Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr),
IntegerType)
checkPrunedAnswers(
bucketSpec,
bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3),
@@ -740,7 +740,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
if useAdvanced =>
Some(convertInToOr(name, values))

case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values))
case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values), _)
if useAdvanced =>
Some(convertInToOr(name, values))

@@ -213,7 +213,7 @@ class HivePartitionFilteringSuite(version: String)
0 to 4,
"aa" :: "ab" :: "ba" :: "bb" :: Nil, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
InSet(v, list.map(_.eval(EmptyRow)).toSet, v.dataType)
})
}

@@ -225,7 +225,7 @@ class HivePartitionFilteringSuite(version: String)
0 to 4,
"aa" :: "ab" :: "ba" :: "bb" :: Nil, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
InSet(v, list.map(_.eval(EmptyRow)).toSet, v.dataType)
})
}

@@ -244,7 +244,7 @@ class HivePartitionFilteringSuite(version: String)
0 to 4,
"ab" :: "ba" :: Nil, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
InSet(v, list.map(_.eval(EmptyRow)).toSet, v.dataType)
})
}
