
[SPARK-30590][SQL] Untyped select API cannot take typed column expression that needs input type #27499

Closed · wants to merge 13 commits
5 changes: 4 additions & 1 deletion project/MimaExcludes.scala
@@ -492,7 +492,10 @@ object MimaExcludes {
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.AFTSurvivalRegression.setPredictionCol"),

// [SPARK-29543][SS][UI] Init structured streaming ui
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryStartedEvent.this")
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryStartedEvent.this"),

// [SPARK-30590][SQL] Untyped select API cannot take typed column expression
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.functions.count")
Member Author:
Put it under the 3.0 exclude rules temporarily. The version number in the master branch is still 3.0.0.

)

// Exclude rules for 2.4.x
14 changes: 10 additions & 4 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1430,6 +1430,11 @@ class Dataset[T] private[sql](
*/
@scala.annotation.varargs
def select(cols: Column*): DataFrame = withPlan {
cols.find(_.isInstanceOf[TypedColumn[_, _]]).foreach { typedCol =>
throw new AnalysisException(s"$typedCol is a typed column that " +
"cannot be passed in untyped `select` API. If you are going to select " +
"multiple typed columns, you can use `Dataset.selectUntyped` API.")
}
Project(cols.map(_.named), logicalPlan)
}
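The guard added above can be sketched outside Spark with hypothetical stand-ins for the `Column`/`TypedColumn` hierarchy (the class and method names below are illustrative, not Spark's real API):

```scala
// Minimal sketch of the new guard in untyped select, assuming a toy
// Column/TypedColumn hierarchy standing in for Spark's.
class Column(val name: String)
class TypedColumn(name: String) extends Column(name)

def select(cols: Column*): Seq[String] = {
  // Reject any typed column up front, mirroring the AnalysisException
  // thrown in the real implementation.
  cols.find(_.isInstanceOf[TypedColumn]).foreach { typedCol =>
    throw new IllegalArgumentException(
      s"${typedCol.name} is a typed column that cannot be passed in untyped `select`")
  }
  cols.map(_.name)
}
```

Untyped columns pass through unchanged; the first typed column encountered aborts the whole call, which is why the test below intercepts an `AnalysisException`.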

@@ -1493,11 +1498,12 @@ class Dataset[T] private[sql](
}

/**
* Internal helper function for building typed selects that return tuples. For simplicity and
* code reuse, we do this without the help of the type system and then use helper functions
* that cast appropriately for the user facing interface.
* Selects a set of typed column based expressions.
*
* @group typedrel
* @since 3.1.0
*/
protected def selectUntyped(columns: TypedColumn[_, _]*): Dataset[_] = {
def selectUntyped(columns: TypedColumn[_, _]*): Dataset[_] = {
val encoders = columns.map(_.encoder)
val namedColumns =
columns.map(_.withInputType(exprEnc, logicalPlan.output).named)
3 changes: 1 addition & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -352,8 +352,7 @@ object functions {
* @group agg_funcs
* @since 1.3.0
*/
def count(columnName: String): TypedColumn[Any, Long] =
count(Column(columnName)).as(ExpressionEncoder[Long]())
def count(columnName: String): Column = count(Column(columnName))
Member Author:
This seems to me to be wrongly a TypedColumn; count is a DeclarativeAggregate.

Contributor (@cloud-fan, Feb 21, 2020):
Member:

It seems like the right change, but let's revert this line considering it's the code freeze period.

Member Author:

Ok. :)


/**
* Aggregate function: returns the number of distinct items in a group.
@@ -219,6 +219,15 @@ case class OptionBooleanIntAggregator(colName: String)
def OptionalBoolIntEncoder: Encoder[Option[(Boolean, Int)]] = ExpressionEncoder()
}

case class FooAgg(s: Int) extends Aggregator[Row, Int, Int] {
def zero: Int = s
def reduce(b: Int, r: Row): Int = b + r.getAs[Int](0)
def merge(b1: Int, b2: Int): Int = b1 + b2
def finish(b: Int): Int = b
def bufferEncoder: Encoder[Int] = Encoders.scalaInt
def outputEncoder: Encoder[Int] = Encoders.scalaInt
}
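For reference, the `zero`/`reduce`/`merge`/`finish` contract that `FooAgg` implements can be exercised without Spark by folding values by hand. This is a sketch with a plain `Seq[Int]` standing in for the `r.getAs[Int](0)` lookups on Spark `Row`s:

```scala
// Pure-Scala sketch of the Aggregator contract FooAgg implements,
// with plain Ints standing in for Row column values.
case class FooAggSketch(s: Int) {
  def zero: Int = s
  def reduce(b: Int, v: Int): Int = b + v
  def merge(b1: Int, b2: Int): Int = b1 + b2
  def finish(b: Int): Int = b

  // Aggregate one partition's values, starting from the zero buffer.
  def runPartition(values: Seq[Int]): Int = values.foldLeft(zero)(reduce)
}

val agg = FooAggSketch(1)
val partial1 = agg.runPartition(Seq(2, 3)) // 1 + 2 + 3 = 6
val partial2 = agg.runPartition(Seq(4))    // 1 + 4 = 5
val result   = agg.finish(agg.merge(partial1, partial2)) // 11
```

Note that in this model each partition buffer starts from `zero`, so the initial value `s` is counted once per partition buffer that gets merged; the exact number of buffers is an engine detail, not part of the contract.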

class DatasetAggregatorSuite extends QueryTest with SharedSparkSession {
import testImplicits._

@@ -394,4 +403,21 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession {
checkAnswer(group, Row("bob", Row(true, 3)) :: Nil)
checkDataset(group.as[OptionBooleanIntData], OptionBooleanIntData("bob", Some((true, 3))))
}

test("SPARK-30590: select multiple typed column expressions") {
val df = Seq((1, 2, 3, 4, 5, 6)).toDF("a", "b", "c", "d", "e", "f")
val fooAgg = (i: Int) => FooAgg(i).toColumn.name(s"foo_agg_$i")

val agg1 = df.select(fooAgg(1), fooAgg(2), fooAgg(3), fooAgg(4), fooAgg(5))
checkDataset(agg1, (3, 5, 7, 9, 11))

val agg2 = df.selectUntyped(fooAgg(1), fooAgg(2), fooAgg(3), fooAgg(4), fooAgg(5), fooAgg(6))
.asInstanceOf[Dataset[(Int, Int, Int, Int, Int, Int)]]
checkDataset(agg2, (3, 5, 7, 9, 11, 13))

val err = intercept[AnalysisException] {
df.select(fooAgg(1), fooAgg(2), fooAgg(3), fooAgg(4), fooAgg(5), fooAgg(6))
Contributor:

Not related to this PR, just a note:

We have 5 overloads of typed select, and typed count is supported in both typed and untyped select. That said, if we add a 6th overload of typed select, it can break queries that call the untyped select with 6 typed counts.

I'm not sure what the best way to move forward is. Maybe we should add new typedSelect methods to disambiguate from the untyped version.

Member Author:

Yeah, to be clear: if we add a 6th overload of typed select, a call to the untyped select with 6 typed counts could return Dataset[(Long, Long, ...)] instead of DataFrame.

I think you meant something like the existing selectUntyped? Although its naming is confusing.

}.getMessage
assert(err.contains("a typed column that cannot be passed in untyped `select` API"))
}
}
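The overload-resolution hazard the reviewers describe above can be reproduced in miniature: fixed-arity overloads for a subtype are more specific than a varargs overload for the supertype, so adding one more fixed-arity overload silently changes which method an existing call site binds to. The classes below are hypothetical stand-ins, not Spark's API:

```scala
// Miniature sketch of the typed/untyped select ambiguity: Dataset has
// fixed-arity TypedColumn overloads of select plus a varargs Column
// overload. Toy stand-in classes below.
class Col
class TypedCol extends Col

object Ds {
  def select(c1: TypedCol): String = "typed-1"
  def select(c1: TypedCol, c2: TypedCol): String = "typed-2"
  def select(cols: Col*): String = "untyped"
}

val t = new TypedCol
// One or two typed columns bind to the more specific typed overloads...
val a = Ds.select(t)       // "typed-1"
val b = Ds.select(t, t)    // "typed-2"
// ...but three fall through to the varargs untyped overload. Adding a
// 3-arg typed overload later would silently rebind this call site.
val c = Ds.select(t, t, t) // "untyped"
```

This mirrors the real situation: calls with up to 5 typed columns hit the typed selects, while a 6-column call falls through to the untyped varargs select, and a future 6-arity typed overload would change that call's result type.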
@@ -597,7 +597,8 @@ class DatasetSuite extends QueryTest
val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()

checkDatasetUnorderly(
ds.groupByKey(_._1).agg(sum("_2").as[Long], sum($"_2" + 1).as[Long], count("*")),
ds.groupByKey(_._1).agg(sum("_2").as[Long],
sum($"_2" + 1).as[Long], count("*").as[Long]),
("a", 30L, 32L, 2L), ("b", 3L, 5L, 2L), ("c", 1L, 2L, 1L))
}

@@ -40,7 +40,7 @@ class DeprecatedDatasetAggregatorSuite extends QueryTest with SharedSparkSession
ds.groupByKey(_._1).agg(
typed.sum(_._2),
expr("sum(_2)").as[Long],
count("*")),
count("*").as[Long]),
("a", 30.0, 30L, 2L), ("b", 3.0, 3L, 2L), ("c", 1.0, 1L, 1L))
}
