Skip to content

Commit

Permalink
[SPARK-40357][SQL] Migrate window type check failures onto error classes
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

In this PR, I propose to use error classes for type check failures in window expressions.

### Why are the changes needed?

Migrating onto error classes unifies Spark SQL error messages.

### Does this PR introduce _any_ user-facing change?

Yes. The PR changes user-facing error messages.

### How was this patch tested?

```
build/sbt "sql/testOnly *SQLQueryTestSuite"
build/sbt "test:testOnly *Window*Suite"
```

Closes #37986 from lvshaokang/SPARK-40357.

Authored-by: lvshaokang <lvshaokang1@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
lvshaokang authored and MaxGekk committed Sep 26, 2022
1 parent 7e39d9b commit 7230f59
Show file tree
Hide file tree
Showing 9 changed files with 484 additions and 75 deletions.
50 changes: 50 additions & 0 deletions core/src/main/resources/error/error-classes.json
Expand Up @@ -118,6 +118,11 @@
"To convert values from <srcType> to <targetType>, you can use the functions <functionNames> instead."
]
},
"FRAME_LESS_OFFSET_WITHOUT_FOLDABLE" : {
"message" : [
"Offset expression <offset> must be a literal."
]
},
"INVALID_JSON_MAP_KEY_TYPE" : {
"message" : [
"Input schema <schema> can only contain STRING as a key type for a MAP."
Expand All @@ -138,11 +143,56 @@
"all arguments must be strings."
]
},
"RANGE_FRAME_INVALID_TYPE" : {
"message" : [
"The data type <orderSpecType> used in the order specification does not match the data type <valueBoundaryType> which is used in the range frame."
]
},
"RANGE_FRAME_MULTI_ORDER" : {
"message" : [
"A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: <orderSpec>."
]
},
"RANGE_FRAME_WITHOUT_ORDER" : {
"message" : [
"A range window frame cannot be used in an unordered window specification."
]
},
"SPECIFIED_WINDOW_FRAME_DIFF_TYPES" : {
"message" : [
"Window frame bounds <lower> and <upper> do not have the same type: <lowerType> <> <upperType>."
]
},
"SPECIFIED_WINDOW_FRAME_INVALID_BOUND" : {
"message" : [
"Window frame upper bound <upper> does not follow the lower bound <lower>."
]
},
"SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE" : {
"message" : [
"The data type of the <location> bound <exprType> does not match the expected data type <expectedType>."
]
},
"SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE" : {
"message" : [
"Window frame <location> bound <expression> is not a literal."
]
},
"SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON" : {
"message" : [
"The lower bound of a window frame must be <comparison> to the upper bound."
]
},
"UNEXPECTED_INPUT_TYPE" : {
"message" : [
"parameter <paramIndex> requires <requiredType> type, however, <inputSql> is of <inputType> type."
]
},
"UNSPECIFIED_FRAME" : {
"message" : [
"Cannot use an UnspecifiedFrame. This should have been converted during analysis."
]
},
"WRONG_NUM_PARAMS" : {
"message" : [
"wrong number of parameters: <actualNum>."
Expand Down
Expand Up @@ -102,7 +102,8 @@ private[spark] object SparkThrowableHelper {
messageParameters.asScala
.toMap // To remove duplicates
.toSeq.sortBy(_._1)
.foreach { case (name, value) => g.writeStringField(name, value) }
.foreach { case (name, value) =>
g.writeStringField(name, value.replaceAll("#\\d+", "#x")) }
g.writeEndObject()
}
val queryContext = e.getQueryContext
Expand Down
Expand Up @@ -51,7 +51,7 @@ object TypeCheckResult {
*/
case class DataTypeMismatch(
errorSubClass: String,
messageParameters: Map[String, String])
messageParameters: Map[String, String] = Map.empty)
extends TypeCheckResult {
def isSuccess: Boolean = false
}
Expand Down
Expand Up @@ -20,8 +20,9 @@ package org.apache.spark.sql.catalyst.expressions
import java.util.Locale

import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedException}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckFailure, TypeCheckSuccess}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.Cast.{toSQLExpr, toSQLType}
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, DeclarativeAggregate, NoOp}
import org.apache.spark.sql.catalyst.trees.{BinaryLike, LeafLike, TernaryLike, UnaryLike}
import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UNRESOLVED_WINDOW_EXPRESSION, WINDOW_EXPRESSION}
Expand Down Expand Up @@ -65,24 +66,27 @@ case class WindowSpecDefinition(
override def checkInputDataTypes(): TypeCheckResult = {
frameSpecification match {
case UnspecifiedFrame =>
TypeCheckFailure(
"Cannot use an UnspecifiedFrame. This should have been converted during analysis. " +
"Please file a bug report.")
DataTypeMismatch(errorSubClass = "UNSPECIFIED_FRAME")
case f: SpecifiedWindowFrame if f.frameType == RangeFrame && !f.isUnbounded &&
orderSpec.isEmpty =>
TypeCheckFailure(
"A range window frame cannot be used in an unordered window specification.")
DataTypeMismatch(errorSubClass = "RANGE_FRAME_WITHOUT_ORDER")
case f: SpecifiedWindowFrame if f.frameType == RangeFrame && f.isValueBound &&
orderSpec.size > 1 =>
TypeCheckFailure(
s"A range window frame with value boundaries cannot be used in a window specification " +
s"with multiple order by expressions: ${orderSpec.mkString(",")}")
DataTypeMismatch(
errorSubClass = "RANGE_FRAME_MULTI_ORDER",
messageParameters = Map(
"orderSpec" -> orderSpec.mkString(",")
)
)
case f: SpecifiedWindowFrame if f.frameType == RangeFrame && f.isValueBound &&
!isValidFrameType(f.valueBoundary.head.dataType) =>
TypeCheckFailure(
s"The data type '${orderSpec.head.dataType.catalogString}' used in the order " +
"specification does not match the data type " +
s"'${f.valueBoundary.head.dataType.catalogString}' which is used in the range frame.")
DataTypeMismatch(
errorSubClass = "RANGE_FRAME_INVALID_TYPE",
messageParameters = Map(
"orderSpecType" -> toSQLType(orderSpec.head.dataType),
"valueBoundaryType" -> toSQLType(f.valueBoundary.head.dataType)
)
)
case _ => TypeCheckSuccess
}
}
Expand Down Expand Up @@ -215,17 +219,32 @@ case class SpecifiedWindowFrame(
// Check combination (of expressions).
(lower, upper) match {
case (l: Expression, u: Expression) if !isValidFrameBoundary(l, u) =>
TypeCheckFailure(s"Window frame upper bound '$upper' does not follow the lower bound " +
s"'$lower'.")
DataTypeMismatch(
errorSubClass = "SPECIFIED_WINDOW_FRAME_INVALID_BOUND",
messageParameters = Map(
"upper" -> toSQLExpr(upper),
"lower" -> toSQLExpr(lower)
)
)
case (l: SpecialFrameBoundary, _) => TypeCheckSuccess
case (_, u: SpecialFrameBoundary) => TypeCheckSuccess
case (l: Expression, u: Expression) if l.dataType != u.dataType =>
TypeCheckFailure(
s"Window frame bounds '$lower' and '$upper' do no not have the same data type: " +
s"'${l.dataType.catalogString}' <> '${u.dataType.catalogString}'")
DataTypeMismatch(
errorSubClass = "SPECIFIED_WINDOW_FRAME_DIFF_TYPES",
messageParameters = Map(
"lower" -> toSQLExpr(lower),
"upper" -> toSQLExpr(upper),
"lowerType" -> toSQLType(l.dataType),
"upperType" -> toSQLType(u.dataType)
)
)
case (l: Expression, u: Expression) if isGreaterThan(l, u) =>
TypeCheckFailure(
"The lower bound of a window frame must be less than or equal to the upper bound")
DataTypeMismatch(
errorSubClass = "SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON",
messageParameters = Map(
"comparison" -> "less than or equal"
)
)
case _ => TypeCheckSuccess
}
}
Expand Down Expand Up @@ -262,11 +281,22 @@ case class SpecifiedWindowFrame(
private def checkBoundary(b: Expression, location: String): TypeCheckResult = b match {
case _: SpecialFrameBoundary => TypeCheckSuccess
case e: Expression if !e.foldable =>
TypeCheckFailure(s"Window frame $location bound '$e' is not a literal.")
DataTypeMismatch(
errorSubClass = "SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE",
messageParameters = Map(
"location" -> location,
"expression" -> toSQLExpr(e)
)
)
case e: Expression if !frameType.inputType.acceptsType(e.dataType) =>
TypeCheckFailure(
s"The data type of the $location bound '${e.dataType.catalogString}' does not match " +
s"the expected data type '${frameType.inputType.simpleString}'.")
DataTypeMismatch(
errorSubClass = "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
messageParameters = Map(
"location" -> location,
"exprType" -> toSQLType(e.dataType),
"expectedType" -> toSQLType(frameType.inputType)
)
)
case _ => TypeCheckSuccess
}

Expand Down Expand Up @@ -421,7 +451,12 @@ sealed abstract class FrameLessOffsetWindowFunction
if (check.isFailure) {
check
} else if (!offset.foldable) {
TypeCheckFailure(s"Offset expression '$offset' must be a literal.")
DataTypeMismatch(
errorSubClass = "FRAME_LESS_OFFSET_WITHOUT_FOLDABLE",
messageParameters = Map(
"offset" -> toSQLExpr(offset)
)
)
} else {
TypeCheckSuccess
}
Expand Down
Expand Up @@ -246,7 +246,20 @@ from t1 where f1 = f2
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve '(PARTITION BY spark_catalog.default.t1.f1 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 24
{
"errorClass" : "DATATYPE_MISMATCH",
"errorSubClass" : "RANGE_FRAME_WITHOUT_ORDER",
"messageParameters" : {
"sqlExpr" : "\"(PARTITION BY f1 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 25,
"stopIndex" : 108,
"fragment" : "(partition by f1\n range between 1 preceding and 1 following)"
} ]
}


-- !query
Expand Down
Expand Up @@ -165,7 +165,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWE
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'string' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
{
"errorClass" : "DATATYPE_MISMATCH",
"errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
"messageParameters" : {
"expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
"exprType" : "\"STRING\"",
"location" : "upper",
"sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 22,
"stopIndex" : 111,
"fragment" : "(PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
} ]
}


-- !query
Expand All @@ -174,7 +190,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BET
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BINARY) FOLLOWING' due to data type mismatch: The data type of the upper bound 'binary' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
{
"errorClass" : "DATATYPE_MISMATCH",
"errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
"messageParameters" : {
"expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
"exprType" : "\"BINARY\"",
"location" : "upper",
"sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 22,
"stopIndex" : 113,
"fragment" : "(PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
} ]
}


-- !query
Expand All @@ -183,7 +215,23 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETW
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'boolean' does not match the expected data type '(numeric or interval day to second or interval year to month or interval)'.; line 1 pos 21
{
"errorClass" : "DATATYPE_MISMATCH",
"errorSubClass" : "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE",
"messageParameters" : {
"expectedType" : "(\"NUMERIC\" or \"INTERVAL DAY TO SECOND\" or \"INTERVAL YEAR TO MONTH\" or \"INTERVAL\")",
"exprType" : "\"BOOLEAN\"",
"location" : "upper",
"sqlExpr" : "\"RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 22,
"stopIndex" : 112,
"fragment" : "(PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
} ]
}


-- !query
Expand All @@ -192,7 +240,22 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as ti
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 21
{
"errorClass" : "DATATYPE_MISMATCH",
"errorSubClass" : "RANGE_FRAME_INVALID_TYPE",
"messageParameters" : {
"orderSpecType" : "\"TIMESTAMP\"",
"sqlExpr" : "\"(PARTITION BY 1 ORDER BY CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)\"",
"valueBoundaryType" : "\"INT\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 22,
"stopIndex" : 136,
"fragment" : "(PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)"
} ]
}


-- !query
Expand Down

0 comments on commit 7230f59

Please sign in to comment.