Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -2030,6 +2030,11 @@
"expects an integer value in [0, <upper>), but got <invalidValue>."
]
},
"CHARSET" : {
"message" : [
"expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got <charset>."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we give advice on the legacy configuration?

Copy link
Member Author

@MaxGekk MaxGekk Nov 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I plan to restrict the supported charsets in the code and add a config for the legacy behaviour. In a follow-up PR, I will update the message and add advice about that config.

]
},
"DATETIME_UNIT" : {
"message" : [
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>

expects an integer value in [0, `<upper>`), but got `<invalidValue>`.

## CHARSET

expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got `<charset>`.

## DATETIME_UNIT

expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql.catalyst.expressions

import java.io.UnsupportedEncodingException
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
import java.util.{Base64 => JBase64}
import java.util.{HashMap, Locale, Map => JMap}
Expand Down Expand Up @@ -2694,17 +2695,25 @@ case class Encode(value: Expression, charset: Expression)

/**
 * Encodes the string `input1` into bytes using the charset named by `input2`.
 *
 * Both inputs are cast to `UTF8String`. When the JVM rejects the charset name with
 * `UnsupportedEncodingException`, the failure is re-reported through
 * `QueryExecutionErrors.invalidCharsetError` with this expression's `prettyName`.
 */
protected override def nullSafeEval(input1: Any, input2: Any): Any = {
  val toCharset = input2.asInstanceOf[UTF8String].toString
  // The only getBytes call must live inside the try: an unguarded call placed before it
  // would throw first and bypass the error mapping below (and encode twice on success).
  try {
    input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
  } catch {
    case _: UnsupportedEncodingException =>
      throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
  }
}

/**
 * Emits the Java code for this expression, mirroring `nullSafeEval`.
 *
 * The charset operand is converted to a Java `String` once and stored in a fresh local, so
 * the same value feeds both `getBytes` and the error raised when the charset is rejected.
 */
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
  nullSafeCodeGen(ctx, ev, (string, charset) => {
    // Fresh name avoids clashing with other locals in the generated method.
    val toCharset = ctx.freshName("toCharset")
    s"""
      String $toCharset = $charset.toString();
      try {
        ${ev.value} = $string.toString().getBytes($toCharset);
      } catch (java.io.UnsupportedEncodingException e) {
        throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset);
      }"""
  })
}

override protected def withNewChildrenInternal(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2758,4 +2758,13 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE
"upper" -> size.toString,
"invalidValue" -> pos.toString))
}

/**
 * Creates (but does not throw) the error reported for an unsupported charset name.
 *
 * @param functionName name of the function that received the charset; passed through `toSQLId`
 * @param charset the rejected charset name, placed in the message parameters as-is
 * @return a `SparkIllegalArgumentException` with error class `INVALID_PARAMETER_VALUE.CHARSET`
 */
def invalidCharsetError(functionName: String, charset: String): RuntimeException = {
  val params = Map(
    "functionName" -> toSQLId(functionName),
    "parameter" -> toSQLId("charset"),
    "charset" -> charset)
  new SparkIllegalArgumentException(
    errorClass = "INVALID_PARAMETER_VALUE.CHARSET",
    messageParameters = params)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
+- OneRowRelation


-- !query
select encode('hello', 'Windows-xxx')
-- !query analysis
Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+- OneRowRelation


-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query analysis
Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]


-- !query
select decode()
-- !query analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
+- OneRowRelation


-- !query
select encode('hello', 'Windows-xxx')
-- !query analysis
Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+- OneRowRelation


-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query analysis
Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]


-- !query
select decode()
-- !query analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ SELECT lpad(x'57', 5, 'abc');
SELECT rpad('abc', 5, x'57');
SELECT rpad(x'57', 5, 'abc');

-- encode
select encode('hello', 'Windows-xxx');
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);

-- decode
select decode();
select decode(encode('abc', 'utf-8'));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,40 @@ struct<rpad(X'57', 5, abc):string>
Wabca


-- !query
select encode('hello', 'Windows-xxx')
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkIllegalArgumentException
{
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
}


-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkIllegalArgumentException
{
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
}


-- !query
select decode()
-- !query schema
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,40 @@ struct<rpad(X'57', 5, abc):string>
Wabca


-- !query
select encode('hello', 'Windows-xxx')
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkIllegalArgumentException
{
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
}


-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkIllegalArgumentException
{
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
}


-- !query
select decode()
-- !query schema
Expand Down