[SPARK-40357][SQL] Migrate window type check failures onto error classes

### What changes were proposed in this pull request?
In the PR, I propose to use error classes in the case of type check failures in window expressions.

### Why are the changes needed?
Migration onto error classes unifies Spark SQL error messages.

### Does this PR introduce _any_ user-facing change?
Yes. The PR changes user-facing error messages.

### How was this patch tested?
```
build/sbt "sql/testOnly *SQLQueryTestSuite"
build/sbt "test:testOnly *Window*Suite"
```

Closes #37986 from lvshaokang/SPARK-40357.

Authored-by: lvshaokang <lvshaokang1@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
apache · Sep 26, 2022 · 7230f59 · 7230f59
1 parent 7e39d9b
commit 7230f59
Show file tree

Hide file tree

Showing 9 changed files with 484 additions and 75 deletions.
diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
@@ -118,6 +118,11 @@
           "To convert values from <srcType> to <targetType>, you can use the functions <functionNames> instead."
         ]
       },
+      "FRAME_LESS_OFFSET_WITHOUT_FOLDABLE" : {
+        "message" : [
+          "Offset expression <offset> must be a literal."
+        ]
+      },
       "INVALID_JSON_MAP_KEY_TYPE" : {
         "message" : [
           "Input schema <schema> can only contain STRING as a key type for a MAP."
@@ -138,11 +143,56 @@
           "all arguments must be strings."
         ]
       },
+      "RANGE_FRAME_INVALID_TYPE" : {
+        "message" : [
+          "The data type <orderSpecType> used in the order specification does not match the data type <valueBoundaryType> which is used in the range frame."
+        ]
+      },
+      "RANGE_FRAME_MULTI_ORDER" : {
+        "message" : [
+          "A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: <orderSpec>."
+        ]
+      },
+      "RANGE_FRAME_WITHOUT_ORDER" : {
+        "message" : [
+          "A range window frame cannot be used in an unordered window specification."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_DIFF_TYPES" : {
+        "message" : [
+          "Window frame bounds <lower> and <upper> do not have the same type: <lowerType> <> <upperType>."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_INVALID_BOUND" : {
+        "message" : [
+          "Window frame upper bound <upper> does not follow the lower bound <lower>."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE" : {
+        "message" : [
+          "The data type of the <location> bound <exprType> does not match the expected data type <expectedType>."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE" : {
+        "message" : [
+          "Window frame <location> bound <expression> is not a literal."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON" : {
+        "message" : [
+          "The lower bound of a window frame must be <comparison> to the upper bound."
+        ]
+      },
       "UNEXPECTED_INPUT_TYPE" : {
         "message" : [
           "parameter <paramIndex> requires <requiredType> type, however, <inputSql> is of <inputType> type."
         ]
       },
+      "UNSPECIFIED_FRAME" : {
+        "message" : [
+          "Cannot use an UnspecifiedFrame. This should have been converted during analysis."
+        ]
+      },
       "WRONG_NUM_PARAMS" : {
         "message" : [
           "wrong number of parameters: <actualNum>."

diff --git a/core/src/main/scala/org/apache/spark/SparkThrowableHelper.scala b/core/src/main/scala/org/apache/spark/SparkThrowableHelper.scala
@@ -102,7 +102,8 @@ private[spark] object SparkThrowableHelper {
             messageParameters.asScala
               .toMap // To remove duplicates
               .toSeq.sortBy(_._1)
-              .foreach { case (name, value) => g.writeStringField(name, value) }
+              .foreach { case (name, value) =>
+                g.writeStringField(name, value.replaceAll("#\\d+", "#x")) }
             g.writeEndObject()
           }
           val queryContext = e.getQueryContext

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala
@@ -51,7 +51,7 @@ object TypeCheckResult {
    */
   case class DataTypeMismatch(
       errorSubClass: String,
-      messageParameters: Map[String, String])
+      messageParameters: Map[String, String] = Map.empty)
     extends TypeCheckResult {
     def isSuccess: Boolean = false
   }

diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -20,8 +20,9 @@ package org.apache.spark.sql.catalyst.expressions
 import java.util.Locale
 
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedException}
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckFailure, TypeCheckSuccess}
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.Cast.{toSQLExpr, toSQLType}
 import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, DeclarativeAggregate, NoOp}
 import org.apache.spark.sql.catalyst.trees.{BinaryLike, LeafLike, TernaryLike, UnaryLike}
 import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UNRESOLVED_WINDOW_EXPRESSION, WINDOW_EXPRESSION}
@@ -65,24 +66,27 @@ case class WindowSpecDefinition(
   override def checkInputDataTypes(): TypeCheckResult = {
     frameSpecification match {
       case UnspecifiedFrame =>
-        TypeCheckFailure(
-          "Cannot use an UnspecifiedFrame. This should have been converted during analysis. " +
-            "Please file a bug report.")
+        DataTypeMismatch(errorSubClass = "UNSPECIFIED_FRAME")
       case f: SpecifiedWindowFrame if f.frameType == RangeFrame && !f.isUnbounded &&
           orderSpec.isEmpty =>
-        TypeCheckFailure(
-          "A range window frame cannot be used in an unordered window specification.")
+        DataTypeMismatch(errorSubClass = "RANGE_FRAME_WITHOUT_ORDER")
       case f: SpecifiedWindowFrame if f.frameType == RangeFrame && f.isValueBound &&
           orderSpec.size > 1 =>
-        TypeCheckFailure(
-          s"A range window frame with value boundaries cannot be used in a window specification " +
-            s"with multiple order by expressions: ${orderSpec.mkString(",")}")
+        DataTypeMismatch(
+          errorSubClass = "RANGE_FRAME_MULTI_ORDER",
+          messageParameters = Map(
+            "orderSpec" -> orderSpec.mkString(",")
+          )
+        )
       case f: SpecifiedWindowFrame if f.frameType == RangeFrame && f.isValueBound &&
           !isValidFrameType(f.valueBoundary.head.dataType) =>
-        TypeCheckFailure(
-          s"The data type '${orderSpec.head.dataType.catalogString}' used in the order " +
-            "specification does not match the data type " +
-            s"'${f.valueBoundary.head.dataType.catalogString}' which is used in the range frame.")
+        DataTypeMismatch(
+          errorSubClass = "RANGE_FRAME_INVALID_TYPE",
+          messageParameters = Map(
+            "orderSpecType" -> toSQLType(orderSpec.head.dataType),
+            "valueBoundaryType" -> toSQLType(f.valueBoundary.head.dataType)
+          )
+        )
       case _ => TypeCheckSuccess
     }
   }
@@ -215,17 +219,32 @@ case class SpecifiedWindowFrame(
     // Check combination (of expressions).
     (lower, upper) match {
       case (l: Expression, u: Expression) if !isValidFrameBoundary(l, u) =>
-        TypeCheckFailure(s"Window frame upper bound '$upper' does not follow the lower bound " +
-          s"'$lower'.")
+        DataTypeMismatch(
+          errorSubClass = "SPECIFIED_WINDOW_FRAME_INVALID_BOUND",
+          messageParameters = Map(
+            "upper" -> toSQLExpr(upper),
+            "lower" -> toSQLExpr(lower)
+          )
+        )
       case (l: SpecialFrameBoundary, _) => TypeCheckSuccess
       case (_, u: SpecialFrameBoundary) => TypeCheckSuccess
       case (l: Expression, u: Expression) if l.dataType != u.dataType =>
-        TypeCheckFailure(
-          s"Window frame bounds '$lower' and '$upper' do no not have the same data type: " +
-            s"'${l.dataType.catalogString}' <> '${u.dataType.catalogString}'")
+        DataTypeMismatch(
+          errorSubClass = "SPECIFIED_WINDOW_FRAME_DIFF_TYPES",
+          messageParameters = Map(
+            "lower" -> toSQLExpr(lower),
+            "upper" -> toSQLExpr(upper),
+            "lowerType" -> toSQLType(l.dataType),
+            "upperType" -> toSQLType(u.dataType)
+          )
+        )
       case (l: Expression, u: Expression) if isGreaterThan(l, u) =>
-        TypeCheckFailure(
-          "The lower bound of a window frame must be less than or equal to the upper bound")
+        DataTypeMismatch(
+          errorSubClass = "SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON",
+          messageParameters = Map(
+            "comparison" -> "less than or equal"
+          )
+        )
       case _ => TypeCheckSuccess
     }
   }
@@ -262,11 +281,22 @@ case class SpecifiedWindowFrame(
   private def checkBoundary(b: Expression, location: String): TypeCheckResult = b match {
     case _: SpecialFrameBoundary => TypeCheckSuccess
     case e: Expression if !e.foldable =>
-      TypeCheckFailure(s"Window frame $location bound '$e' is not a literal.")
+      DataTypeMismatch(
+        errorSubClass = "SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE",
+        messageParameters = Map(
+          "location" -> location,
+          "expression" -> toSQLExpr(e)
+        )
+      )
     case e: Expression if !frameType.inputType.acceptsType(e.dataType) =>
-      TypeCheckFailure(
-        s"The data type of the $location bound '${e.dataType.catalogString}' does not match " +
-          s"the expected data type '${frameType.inputType.simpleString}'.")
+      DataTypeMismatch(
+        errorSubClass = "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
+        messageParameters = Map(
+          "location" -> location,
+          "exprType" -> toSQLType(e.dataType),
+          "expectedType" -> toSQLType(frameType.inputType)
+        )
+      )
     case _ => TypeCheckSuccess
   }
 
@@ -421,7 +451,12 @@ sealed abstract class FrameLessOffsetWindowFunction
     if (check.isFailure) {
       check
     } else if (!offset.foldable) {
-      TypeCheckFailure(s"Offset expression '$offset' must be a literal.")
+      DataTypeMismatch(
+        errorSubClass = "FRAME_LESS_OFFSET_WITHOUT_FOLDABLE",
+        messageParameters = Map(
+          "offset" -> toSQLExpr(offset)
+        )
+      )
     } else {
       TypeCheckSuccess
     }

diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out
@@ -246,7 +246,20 @@ from t1 where f1 = f2
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve '(PARTITION BY spark_catalog.default.t1.f1 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 24
+{
+  "errorClass" : "DATATYPE_MISMATCH",
+  "errorSubClass" : "RANGE_FRAME_WITHOUT_ORDER",
+  "messageParameters" : {
+    "sqlExpr" : "\"(PARTITION BY f1 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 25,
+    "stopIndex" : 108,
+    "fragment" : "(partition by f1\n                         range between 1 preceding and 1 following)"
+  } ]
+}
 
 
 -- !query

diff --git a/...core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out b/...core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out
@@ -165,7 +165,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWE
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'string' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
+{
+  "errorClass" : "DATATYPE_MISMATCH",
+  "errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
+  "messageParameters" : {
+    "expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
+    "exprType" : "\"STRING\"",
+    "location" : "upper",
+    "sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 22,
+    "stopIndex" : 111,
+    "fragment" : "(PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
+  } ]
+}
 
 
 -- !query
@@ -174,7 +190,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BET
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BINARY) FOLLOWING' due to data type mismatch: The data type of the upper bound 'binary' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
+{
+  "errorClass" : "DATATYPE_MISMATCH",
+  "errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
+  "messageParameters" : {
+    "expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
+    "exprType" : "\"BINARY\"",
+    "location" : "upper",
+    "sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 22,
+    "stopIndex" : 113,
+    "fragment" : "(PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
+  } ]
+}
 
 
 -- !query
@@ -183,7 +215,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETW
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'boolean' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
+{
+  "errorClass" : "DATATYPE_MISMATCH",
+  "errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
+  "messageParameters" : {
+    "expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
+    "exprType" : "\"BOOLEAN\"",
+    "location" : "upper",
+    "sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 22,
+    "stopIndex" : 112,
+    "fragment" : "(PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
+  } ]
+}
 
 
 -- !query
@@ -192,7 +240,22 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as ti
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 21
+{
+  "errorClass" : "DATATYPE_MISMATCH",
+  "errorSubClass" : "RANGE_FRAME_INVALID_TYPE",
+  "messageParameters" : {
+    "orderSpecType" : "\"TIMESTAMP\"",
+    "sqlExpr" : "\"(PARTITION BY 1 ORDER BY CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)\"",
+    "valueBoundaryType" : "\"INT\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 22,
+    "stopIndex" : 136,
+    "fragment" : "(PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
+  } ]
+}
 
 
 -- !query