From a909cb13b3d10f18c7ea11e3f3f0f1ffe1f27260 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Wed, 11 Sep 2024 09:54:09 -0400 Subject: [PATCH 01/27] reapply changes --- .../resources/error/error-conditions.json | 5 + .../catalyst/expressions/aggregate/Mode.scala | 81 +++++++--- .../sql/CollationSQLExpressionsSuite.scala | 152 +++++++++++++++++- 3 files changed, 213 insertions(+), 25 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 4bc48c042a0b3..2bc681453556c 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -1005,6 +1005,11 @@ "The input of can't be type data." ] }, + "UNSUPPORTED_MODE_DATA_TYPE" : { + "message" : [ + "The does not support the data type, because ." + ] + }, "UNSUPPORTED_UDF_INPUT_TYPE" : { "message" : [ "UDFs do not support '' as an input data type." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index e254a670991a1..3c7caad6dcac6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -17,17 +17,20 @@ package org.apache.spark.sql.catalyst.expressions.aggregate +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, TypeCheckResult, UnresolvedWithinGroup} import org.apache.spark.sql.catalyst.expressions.{Ascending, Descending, Expression, ExpressionDescription, ImplicitCastInputTypes, SortOrder} import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.types.PhysicalDataType -import 
org.apache.spark.sql.catalyst.util.{CollationFactory, GenericArrayData, UnsafeRowUtils} +import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, GenericArrayData, UnsafeRowUtils} +import org.apache.spark.sql.errors.DataTypeErrors.{toSQLId, toSQLType} import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, ArrayType, BooleanType, DataType, StringType} +import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, ArrayType, BooleanType, DataType, MapType, StringType, StructField, StructType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.collection.OpenHashMap + case class Mode( child: Expression, mutableAggBufferOffset: Int = 0, @@ -50,17 +53,21 @@ case class Mode( override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType) override def checkInputDataTypes(): TypeCheckResult = { - if (UnsafeRowUtils.isBinaryStable(child.dataType) || child.dataType.isInstanceOf[StringType]) { + // TODO: SPARK-49358: Mode expression for map type with collated fields + if (UnsafeRowUtils.isBinaryStable(child.dataType) || + !child.dataType.existsRecursively(f => f.isInstanceOf[MapType] && + !UnsafeRowUtils.isBinaryStable(f))) { /* * The Mode class uses collation awareness logic to handle string data. - * Complex types with collated fields are not yet supported. + * All complex types except MapType with collated fields are supported. 
*/ - // TODO: SPARK-48700: Mode expression for complex types (all collations) super.checkInputDataTypes() } else { - TypeCheckResult.TypeCheckFailure("The input to the function 'mode' was" + - " a type of binary-unstable type that is " + - s"not currently supported by ${prettyName}.") + TypeCheckResult.DataTypeMismatch("UNSUPPORTED_MODE_DATA_TYPE", + messageParameters = + Map("child" -> toSQLType(child.dataType), + "mode" -> toSQLId(prettyName), + "reason" -> "MapType with collated fields")) } } @@ -86,6 +93,49 @@ case class Mode( buffer } + private def getCollationAwareBuffer( + childDataType: DataType, + buffer: OpenHashMap[AnyRef, Long]): Iterable[(AnyRef, Long)] = { + def groupAndReduceBuffer(groupingFunction: AnyRef => _): Iterable[(AnyRef, Long)] = { + buffer.groupMapReduce(t => + groupingFunction(t._1))(x => x)((x, y) => (x._1, x._2 + y._2)).values + } + def determineBufferingFunction( + childDataType: DataType): Option[AnyRef => _] = { + childDataType match { + case _ if UnsafeRowUtils.isBinaryStable(child.dataType) => None + case _ => Some(collationAwareTransform(_, childDataType)) + } + } + determineBufferingFunction(childDataType).map(groupAndReduceBuffer).getOrElse(buffer) + } + + private def collationAwareTransform(data: AnyRef, dataType: DataType): AnyRef = { + dataType match { + case _ if UnsafeRowUtils.isBinaryStable(dataType) => data + case st: StructType => + processStructTypeWithBuffer(data.asInstanceOf[InternalRow].toSeq(st).zip(st.fields)) + case at: ArrayType => processArrayTypeWithBuffer(at, data.asInstanceOf[ArrayData]) + case st: StringType => + CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) + case _ => + throw new SparkUnsupportedOperationException( + s"Unsupported data type for collation-aware mode: $dataType") + } + } + + private def processStructTypeWithBuffer( + tuples: Seq[(Any, StructField)]): Seq[Any] = { + tuples.map(t => collationAwareTransform(t._1.asInstanceOf[AnyRef], t._2.dataType)) + } + + 
private def processArrayTypeWithBuffer( + a: ArrayType, + data: ArrayData): Seq[Any] = { + (0 until data.numElements()).map(i => + collationAwareTransform(data.get(i, a.elementType), a.elementType)) + } + override def eval(buffer: OpenHashMap[AnyRef, Long]): Any = { if (buffer.isEmpty) { return null @@ -102,17 +152,12 @@ case class Mode( * to a single value (the sum of the counts), and finally reduces the groups to a single map. * * The new map is then used in the rest of the Mode evaluation logic. + * + * It is expected to work for all simple and complex types with + * collated fields, except for MapType (temporarily). */ - val collationAwareBuffer = child.dataType match { - case c: StringType if - !CollationFactory.fetchCollation(c.collationId).supportsBinaryEquality => - val collationId = c.collationId - val modeMap = buffer.toSeq.groupMapReduce { - case (k, _) => CollationFactory.getCollationKey(k.asInstanceOf[UTF8String], collationId) - }(x => x)((x, y) => (x._1, x._2 + y._2)).values - modeMap - case _ => buffer - } + val collationAwareBuffer = getCollationAwareBuffer(child.dataType, buffer) + reverseOpt.map { reverse => val defaultKeyOrdering = if (reverse) { PhysicalDataType.ordering(child.dataType).asInstanceOf[Ordering[AnyRef]].reverse diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index f8cd840ecdbb9..1bb135cf71f50 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -19,11 +19,12 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat +import java.util.Locale import scala.collection.immutable.Seq -import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException} -import 
org.apache.spark.sql.catalyst.ExtendedAnalysisException +import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException, SparkThrowable} +import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Mode import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} @@ -1711,9 +1712,9 @@ class CollationSQLExpressionsSuite test("Support Mode.eval(buffer)") { case class UTF8StringModeTestCase[R]( - collationId: String, - bufferValues: Map[UTF8String, Long], - result: R) + collationId: String, + bufferValues: Map[UTF8String, Long], + result: R) val bufferValuesUTF8String = Map( UTF8String.fromString("a") -> 5L, @@ -1736,6 +1737,40 @@ class CollationSQLExpressionsSuite }) } + test("Support Mode.eval(buffer) with complex types") { + case class UTF8StringModeTestCase[R]( + collationId: String, + bufferValues: Map[InternalRow, Long], + result: R) + + val bufferValuesUTF8String: Map[Any, Long] = Map( + UTF8String.fromString("a") -> 5L, + UTF8String.fromString("b") -> 4L, + UTF8String.fromString("B") -> 3L, + UTF8String.fromString("d") -> 2L, + UTF8String.fromString("e") -> 1L) + + val bufferValuesComplex = bufferValuesUTF8String.map{ + case (k, v) => (InternalRow.fromSeq(Seq(k, k, k)), v) + } + val testCasesUTF8String = Seq( + UTF8StringModeTestCase("utf8_binary", bufferValuesComplex, "[a,a,a]"), + UTF8StringModeTestCase("UTF8_LCASE", bufferValuesComplex, "[b,b,b]"), + UTF8StringModeTestCase("unicode_ci", bufferValuesComplex, "[b,b,b]"), + UTF8StringModeTestCase("unicode", bufferValuesComplex, "[a,a,a]")) + + testCasesUTF8String.foreach(t => { + val buffer = new OpenHashMap[AnyRef, Long](5) + val myMode = Mode(child = Literal.create(null, StructType(Seq( + StructField("f1", StringType(t.collationId), true), + StructField("f2", StringType(t.collationId), true), + StructField("f3", StringType(t.collationId), 
true) + )))) + t.bufferValues.foreach { case (k, v) => buffer.update(k, v) } + assert(myMode.eval(buffer).toString.toLowerCase() == t.result.toLowerCase()) + }) + } + test("Support mode for string expression with collated strings in struct") { case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( @@ -1801,7 +1836,7 @@ class CollationSQLExpressionsSuite s"named_struct('f2', collate('$elt', '${t.collationId}')), 'f3', 1)").mkString(",") }.mkString(",") - val tableName = s"t_${t.collationId}_mode_nested_struct" + val tableName = s"t_${t.collationId}_mode_nested_struct1" withTable(tableName) { sql(s"CREATE TABLE ${tableName}(i STRUCT, f3: INT>) USING parquet") @@ -1839,6 +1874,58 @@ class CollationSQLExpressionsSuite } test("Support mode for string expression with collated strings in array complex type") { + case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => s"array(named_struct('f2', " + + s"collate('$elt', '${t.collationId}'), 'f3', 1))").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode_nested_struct2" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(" + + s"i ARRAY< STRUCT>)" + + s" USING parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT lower(element_at(mode(i).f2, 1)) FROM ${tableName}" + checkAnswer(sql(query), Row(t.result)) + } + }) + } + + test("Support mode for string expression with collated strings in 3D array type") { + case class 
ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => + s"array(array(array(collate('$elt', '${t.collationId}'))))").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode_nested_3d_array" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(i ARRAY>>) USING parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT lower(" + + s"element_at(element_at(element_at(mode(i),1),1),1)) FROM ${tableName}" + checkAnswer(sql(query), Row(t.result)) + } + }) + } + + test("Support mode for string expression with collated complex type - Highly nested") { case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) val testCases = Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), @@ -1852,7 +1939,7 @@ class CollationSQLExpressionsSuite s"array(collate('$elt', '${t.collationId}'))), 'f3', 1))").mkString(",") }.mkString(",") - val tableName = s"t_${t.collationId}_mode_nested_struct" + val tableName = s"t_${t.collationId}_mode_highly_nested_struct" withTable(tableName) { sql(s"CREATE TABLE ${tableName}(" + s"i ARRAY>, f3: INT>>)" + @@ -1886,6 +1973,57 @@ class CollationSQLExpressionsSuite }) } + test("Support mode expression with collated in recursively nested struct with map with keys") { + case class ModeTestCase(collationId: String, bufferValues: Map[String, Long], result: String) + Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), + ModeTestCase("unicode", Map("a" -> 
3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), + ModeTestCase("utf8_lcase", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}") + ).foreach(t1 => { + def checkThisError(t: ModeTestCase, query: String): Any = { + val c = s"STRUCT>" + val c1 = s"\"${c}\"" + checkError( + exception = intercept[SparkThrowable] { + sql(query).collect() + }, + condition = "DATATYPE_MISMATCH.UNSUPPORTED_MODE_DATA_TYPE", + parameters = Map( + ("sqlExpr", "\"mode(i)\""), + ("child", c1), + ("mode", "`mode`"), + ("reason", "MapType with collated fields")), + queryContext = Seq(ExpectedContext("mode(i)", 18, 24)).toArray + ) + } + + def getValuesToAdd(t: ModeTestCase): String = { + val valuesToAdd = t.bufferValues.map { + case (elt, numRepeats) => + (0L to numRepeats).map(i => + s"named_struct('m1', map(collate('$elt', '${t.collationId}'), 1))" + ).mkString(",") + }.mkString(",") + valuesToAdd + } + val tableName = s"t_${t1.collationId}_mode_nested_map_struct1" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(" + + s"i STRUCT>) USING parquet") + sql(s"INSERT INTO ${tableName} VALUES ${getValuesToAdd(t1)}") + val query = "SELECT lower(cast(mode(i).m1 as string))" + + s" FROM ${tableName}" + if (t1.collationId == "utf8_binary") { + checkAnswer(sql(query), Row(t1.result)) + } else { + checkThisError(t1, query) + } + } + } + ) + } + test("SPARK-48430: Map value extraction with collations") { for { collateKey <- Seq(true, false) From ce865a9b9ff1e44e1175811b585c55f05406a914 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Wed, 11 Sep 2024 17:02:36 -0400 Subject: [PATCH 02/27] SPARK COLLATIONS MAP --- .../sql/CollationSQLExpressionsSuite.scala | 79 +------------------ 1 file changed, 3 insertions(+), 76 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 1bb135cf71f50..cffcb43bfbe2b 
100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1791,33 +1791,7 @@ class CollationSQLExpressionsSuite t.collationId + ", f2: INT>) USING parquet") sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) val query = s"SELECT lower(mode(i).f1) FROM ${tableName}" - if(t.collationId == "UTF8_LCASE" || - t.collationId == "unicode_ci" || - t.collationId == "unicode") { - // Cannot resolve "mode(i)" due to data type mismatch: - // Input to function mode was a complex type with strings collated on non-binary - // collations, which is not yet supported.. SQLSTATE: 42K09; line 1 pos 13; - val params = Seq(("sqlExpr", "\"mode(i)\""), - ("msg", "The input to the function 'mode'" + - " was a type of binary-unstable type that is not currently supported by mode."), - ("hint", "")).toMap - checkError( - exception = intercept[AnalysisException] { - sql(query) - }, - condition = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", - parameters = params, - queryContext = Array( - ExpectedContext(objectType = "", - objectName = "", - startIndex = 13, - stopIndex = 19, - fragment = "mode(i)") - ) - ) - } else { - checkAnswer(sql(query), Row(t.result)) - } + checkAnswer(sql(query), Row(t.result)) } }) } @@ -1842,33 +1816,7 @@ class CollationSQLExpressionsSuite t.collationId + ">, f3: INT>) USING parquet") sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) val query = s"SELECT lower(mode(i).f1.f2) FROM ${tableName}" - if(t.collationId == "UTF8_LCASE" || - t.collationId == "unicode_ci" || - t.collationId == "unicode") { - // Cannot resolve "mode(i)" due to data type mismatch: - // Input to function mode was a complex type with strings collated on non-binary - // collations, which is not yet supported.. 
SQLSTATE: 42K09; line 1 pos 13; - val params = Seq(("sqlExpr", "\"mode(i)\""), - ("msg", "The input to the function 'mode' " + - "was a type of binary-unstable type that is not currently supported by mode."), - ("hint", "")).toMap - checkError( - exception = intercept[AnalysisException] { - sql(query) - }, - condition = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", - parameters = params, - queryContext = Array( - ExpectedContext(objectType = "", - objectName = "", - startIndex = 13, - stopIndex = 19, - fragment = "mode(i)") - ) - ) - } else { - checkAnswer(sql(query), Row(t.result)) - } + checkAnswer(sql(query), Row(t.result)) } }) } @@ -1946,29 +1894,8 @@ class CollationSQLExpressionsSuite s" USING parquet") sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) val query = s"SELECT lower(element_at(element_at(mode(i), 1).s1.a2, 1)) FROM ${tableName}" - if(t.collationId == "UTF8_LCASE" || - t.collationId == "unicode_ci" || t.collationId == "unicode") { - val params = Seq(("sqlExpr", "\"mode(i)\""), - ("msg", "The input to the function 'mode' was a type" + - " of binary-unstable type that is not currently supported by mode."), - ("hint", "")).toMap - checkError( - exception = intercept[AnalysisException] { - sql(query) - }, - condition = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", - parameters = params, - queryContext = Array( - ExpectedContext(objectType = "", - objectName = "", - startIndex = 35, - stopIndex = 41, - fragment = "mode(i)") - ) - ) - } else { + checkAnswer(sql(query), Row(t.result)) - } } }) } From d6974039d717065aa94500e6a4e94964ecdad266 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 13 Sep 2024 10:39:33 -0400 Subject: [PATCH 03/27] formatting --- .../sql/CollationSQLExpressionsSuite.scala | 482 +++++++++--------- 1 file changed, 243 insertions(+), 239 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index cffcb43bfbe2b..a2b306bc10568 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.util.collection.OpenHashMap // scalastyle:off nonascii class CollationSQLExpressionsSuite - extends QueryTest + extends QueryTest with SharedSparkSession with ExpressionEvalHelper { @@ -43,10 +43,10 @@ class CollationSQLExpressionsSuite test("Support Md5 hash expression with collation") { case class Md5TestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( Md5TestCase("Spark", "UTF8_BINARY", "8cde774d6f7333752ed72cacddb05126"), @@ -73,11 +73,11 @@ class CollationSQLExpressionsSuite test("Support Sha2 hash expression with collation") { case class Sha2TestCase( - input: String, - collationName: String, - bitLength: Int, - result: String - ) + input: String, + collationName: String, + bitLength: Int, + result: String + ) val testCases = Seq( Sha2TestCase("Spark", "UTF8_BINARY", 256, @@ -108,10 +108,10 @@ class CollationSQLExpressionsSuite test("Support Sha1 hash expression with collation") { case class Sha1TestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( Sha1TestCase("Spark", "UTF8_BINARY", "85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c"), @@ -138,10 +138,10 @@ class CollationSQLExpressionsSuite test("Support Crc32 hash expression with collation") { case class Crc321TestCase( - input: String, - collationName: String, - result: Int - ) + input: String, + collationName: String, + result: Int + ) val testCases = Seq( Crc321TestCase("Spark", "UTF8_BINARY", 1557323817), @@ -166,10 +166,10 @@ class CollationSQLExpressionsSuite test("Support 
Murmur3Hash hash expression with collation") { case class Murmur3HashTestCase( - input: String, - collationName: String, - result: Int - ) + input: String, + collationName: String, + result: Int + ) val testCases = Seq( Murmur3HashTestCase("Spark", "UTF8_BINARY", 228093765), @@ -194,10 +194,10 @@ class CollationSQLExpressionsSuite test("Support XxHash64 hash expression with collation") { case class XxHash64TestCase( - input: String, - collationName: String, - result: Long - ) + input: String, + collationName: String, + result: Long + ) val testCases = Seq( XxHash64TestCase("Spark", "UTF8_BINARY", -4294468057691064905L), @@ -222,10 +222,10 @@ class CollationSQLExpressionsSuite test("Support UrlEncode hash expression with collation") { case class UrlEncodeTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( UrlEncodeTestCase("https://spark.apache.org", "UTF8_BINARY", @@ -256,10 +256,10 @@ class CollationSQLExpressionsSuite test("Support UrlDecode hash expression with collation") { case class UrlDecodeTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_BINARY", @@ -290,11 +290,11 @@ class CollationSQLExpressionsSuite test("Support ParseUrl hash expression with collation") { case class ParseUrlTestCase( - input: String, - collationName: String, - path: String, - result: String - ) + input: String, + collationName: String, + path: String, + result: String + ) val testCases = Seq( ParseUrlTestCase("http://spark.apache.org/path?query=1", "UTF8_BINARY", "HOST", @@ -325,13 +325,13 @@ class CollationSQLExpressionsSuite test("Support CsvToStructs csv expression with collation") { case class CsvToStructsTestCase( - input: String, - collationName: String, - schema: String, - options: String, - result: Row, - 
structFields: Seq[StructField] - ) + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) val testCases = Seq( CsvToStructsTestCase("1", "UTF8_BINARY", "'a INT'", "", @@ -374,10 +374,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfCsv csv expression with collation") { case class SchemaOfCsvTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfCsvTestCase("1", "UTF8_BINARY", "STRUCT<_c0: INT>"), @@ -406,10 +406,10 @@ class CollationSQLExpressionsSuite test("Support StructsToCsv csv expression with collation") { case class StructsToCsvTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( StructsToCsvTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", "1,2"), @@ -438,11 +438,11 @@ class CollationSQLExpressionsSuite test("Conv expression with collation") { // Supported collations case class ConvTestCase( - num: String, - from_base: String, - to_base: String, - collationName: String, - result: String) + num: String, + from_base: String, + to_base: String, + collationName: String, + result: String) val testCases = Seq( ConvTestCase("100", "2", "10", "UTF8_BINARY", "4"), @@ -464,9 +464,9 @@ class CollationSQLExpressionsSuite test("Bin expression with collation") { // Supported collations case class BinTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( BinTestCase("13", "UTF8_BINARY", "1101"), @@ -489,9 +489,9 @@ class CollationSQLExpressionsSuite test("Hex with non-string input expression with collation") { case class HexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( 
HexTestCase("13", "UTF8_BINARY", "D"), @@ -514,9 +514,9 @@ class CollationSQLExpressionsSuite test("Hex with string input expression with collation") { case class HexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( HexTestCase("Spark SQL", "UTF8_BINARY", "537061726B2053514C"), @@ -537,9 +537,9 @@ class CollationSQLExpressionsSuite test("UnHex expression with collation") { case class UnHexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( UnHexTestCase("537061726B2053514C", "UTF8_BINARY", "Spark SQL"), @@ -560,13 +560,13 @@ class CollationSQLExpressionsSuite test("Support XPath expressions with collation") { case class XPathTestCase( - xml: String, - xpath: String, - functionName: String, - collationName: String, - result: Any, - resultType: DataType - ) + xml: String, + xpath: String, + functionName: String, + collationName: String, + result: Any, + resultType: DataType + ) val testCases = Seq( XPathTestCase("1", "a/b", @@ -604,10 +604,10 @@ class CollationSQLExpressionsSuite test("Support StringSpace expression with collation") { case class StringSpaceTestCase( - input: Int, - collationName: String, - result: String - ) + input: Int, + collationName: String, + result: String + ) val testCases = Seq( StringSpaceTestCase(1, "UTF8_BINARY", " "), @@ -634,12 +634,12 @@ class CollationSQLExpressionsSuite test("Support ToNumber & TryToNumber expressions with collation") { case class ToNumberTestCase( - input: String, - collationName: String, - format: String, - result: Any, - resultType: DataType - ) + input: String, + collationName: String, + format: String, + result: Any, + resultType: DataType + ) val testCases = Seq( ToNumberTestCase("123", "UTF8_BINARY", "999", 123, DecimalType(3, 0)), @@ -705,11 +705,11 @@ class CollationSQLExpressionsSuite test("Support ToChar expression 
with collation") { case class ToCharTestCase( - input: Int, - collationName: String, - format: String, - result: String - ) + input: Int, + collationName: String, + format: String, + result: String + ) val testCases = Seq( ToCharTestCase(12, "UTF8_BINARY", "999", " 12"), @@ -736,11 +736,11 @@ class CollationSQLExpressionsSuite test("Support GetJsonObject json expression with collation") { case class GetJsonObjectTestCase( - input: String, - path: String, - collationName: String, - result: String - ) + input: String, + path: String, + collationName: String, + result: String + ) val testCases = Seq( GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY", "b"), @@ -767,11 +767,11 @@ class CollationSQLExpressionsSuite test("Support JsonTuple json expression with collation") { case class JsonTupleTestCase( - input: String, - names: String, - collationName: String, - result: Row - ) + input: String, + names: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY", @@ -802,11 +802,11 @@ class CollationSQLExpressionsSuite test("Support JsonToStructs json expression with collation") { case class JsonToStructsTestCase( - input: String, - schema: String, - collationName: String, - result: Row - ) + input: String, + schema: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", @@ -837,10 +837,10 @@ class CollationSQLExpressionsSuite test("Support StructsToJson json expression with collation") { case class StructsToJsonTestCase( - struct: String, - collationName: String, - result: Row - ) + struct: String, + collationName: String, + result: Row + ) val testCases = Seq( StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", @@ -871,10 +871,10 @@ class CollationSQLExpressionsSuite test("Support LengthOfJsonArray json expression with collation") { case class LengthOfJsonArrayTestCase( - input: String, - 
collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY", Row(4)), @@ -900,10 +900,10 @@ class CollationSQLExpressionsSuite test("Support JsonObjectKeys json expression with collation") { case class JsonObjectKeysJsonArrayTestCase( - input: String, - collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY", @@ -934,10 +934,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfJson json expression with collation") { case class SchemaOfJsonTestCase( - input: String, - collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( SchemaOfJsonTestCase("'[{\"col\":0}]'", @@ -968,11 +968,11 @@ class CollationSQLExpressionsSuite test("Support `StringToMap` expression with collation") { case class StringToMapTestCase[R]( - text: String, - pairDelim: String, - keyValueDelim: String, - collation: String, - result: R) + text: String, + pairDelim: String, + keyValueDelim: String, + collation: String, + result: R) val testCases = Seq( StringToMapTestCase("a:1,b:2,c:3", ",", ":", "UTF8_BINARY", Map("a" -> "1", "b" -> "2", "c" -> "3")), @@ -1095,11 +1095,11 @@ class CollationSQLExpressionsSuite test("Support AesEncrypt misc expression with collation") { // Supported collations case class AesEncryptTestCase( - input: String, - collationName: String, - params: String, - result: String - ) + input: String, + collationName: String, + params: String, + result: String + ) val testCases = Seq( AesEncryptTestCase("Spark", "UTF8_BINARY", "'1234567890abcdef', 'ECB'", "8DE7DB79A23F3E8ED530994DDEA98913"), @@ -1127,11 +1127,11 @@ class CollationSQLExpressionsSuite test("Support AesDecrypt misc expression with collation") { // Supported collations case class AesDecryptTestCase( - input: 
String, - collationName: String, - params: String, - result: String - ) + input: String, + collationName: String, + params: String, + result: String + ) val testCases = Seq( AesDecryptTestCase("8DE7DB79A23F3E8ED530994DDEA98913", "UTF8_BINARY", "'1234567890abcdef', 'ECB'", "Spark"), @@ -1156,7 +1156,7 @@ class CollationSQLExpressionsSuite test("Support Mask expression with collation") { // Supported collations case class MaskTestCase[R](i: String, u: String, l: String, d: String, o: String, c: String, - result: R) + result: R) val testCases = Seq( MaskTestCase("ab-CD-12-@$", null, null, null, null, "UTF8_BINARY", "ab-CD-12-@$"), MaskTestCase("ab-CD-12-@$", "X", null, null, null, "UTF8_LCASE", "ab-XX-12-@$"), @@ -1165,6 +1165,7 @@ class CollationSQLExpressionsSuite ) testCases.foreach(t => { def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" + val query = s"SELECT mask(${col(t.i)}, ${col(t.u)}, ${col(t.l)}, ${col(t.d)}, ${col(t.o)})" // Result & data type var result = sql(query) @@ -1177,7 +1178,9 @@ class CollationSQLExpressionsSuite ) testCasting.foreach(t => { def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" + def str(s: String): String = if (s == null) "null" else s"'$s'" + val query1 = s"SELECT mask(${col(t.i)}, ${str(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" val query2 = s"SELECT mask(${str(t.i)}, ${col(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" val query3 = s"SELECT mask(${str(t.i)}, ${str(t.u)}, ${col(t.l)}, ${str(t.d)}, ${str(t.o)})" @@ -1201,13 +1204,13 @@ class CollationSQLExpressionsSuite test("Support XmlToStructs xml expression with collation") { case class XmlToStructsTestCase( - input: String, - collationName: String, - schema: String, - options: String, - result: Row, - structFields: Seq[StructField] - ) + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) val testCases = Seq( XmlToStructsTestCase("

1

", "UTF8_BINARY", "'a INT'", "", @@ -1249,10 +1252,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfXml xml expression with collation") { case class SchemaOfXmlTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfXmlTestCase("

1

", "UTF8_BINARY", "STRUCT"), @@ -1281,10 +1284,10 @@ class CollationSQLExpressionsSuite test("Support StructsToXml xml expression with collation") { case class StructsToXmlTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( StructsToXmlTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", @@ -1333,10 +1336,10 @@ class CollationSQLExpressionsSuite test("Support ParseJson & TryParseJson variant expressions with collation") { case class ParseJsonTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( ParseJsonTestCase("{\"a\":1,\"b\":2}", "UTF8_BINARY", "{\"a\":1,\"b\":2}"), @@ -1405,10 +1408,10 @@ class CollationSQLExpressionsSuite test("Support IsVariantNull variant expressions with collation") { case class IsVariantNullTestCase( - input: String, - collationName: String, - result: Boolean - ) + input: String, + collationName: String, + result: Boolean + ) val testCases = Seq( IsVariantNullTestCase("'null'", "UTF8_BINARY", result = true), @@ -1433,13 +1436,13 @@ class CollationSQLExpressionsSuite test("Support VariantGet & TryVariantGet variant expressions with collation") { case class VariantGetTestCase( - input: String, - path: String, - variantType: String, - collationName: String, - result: Any, - resultType: DataType - ) + input: String, + path: String, + variantType: String, + collationName: String, + result: Any, + resultType: DataType + ) val testCases = Seq( VariantGetTestCase("{\"a\": 1}", "$.a", "int", "UTF8_BINARY", 1, IntegerType), @@ -1509,20 +1512,20 @@ class CollationSQLExpressionsSuite test("Support VariantExplode variant expressions with collation") { case class VariantExplodeTestCase( - input: String, - collationName: String, - result: String, - resultType: Seq[StructField] - ) + input: String, + collationName: String, + result: String, + 
resultType: Seq[StructField] + ) val testCases = Seq( VariantExplodeTestCase("[\"hello\", \"world\"]", "UTF8_BINARY", - Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), - Seq[StructField]( - StructField("pos", IntegerType, nullable = false), - StructField("key", StringType("UTF8_BINARY")), - StructField("value", VariantType, nullable = false) - ) + Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_BINARY")), + StructField("value", VariantType, nullable = false) + ) ), VariantExplodeTestCase("[\"Spark\", \"SQL\"]", "UTF8_LCASE", Row(0, "null", "\"Spark\"").toString() + Row(1, "null", "\"SQL\"").toString(), @@ -1568,10 +1571,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfVariant variant expressions with collation") { case class SchemaOfVariantTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfVariantTestCase("null", "UTF8_BINARY", "VOID"), @@ -1599,10 +1602,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfVariantAgg variant expressions with collation") { case class SchemaOfVariantAggTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfVariantAggTestCase("('1'), ('2'), ('3')", "UTF8_BINARY", "BIGINT"), @@ -1750,7 +1753,7 @@ class CollationSQLExpressionsSuite UTF8String.fromString("d") -> 2L, UTF8String.fromString("e") -> 1L) - val bufferValuesComplex = bufferValuesUTF8String.map{ + val bufferValuesComplex = bufferValuesUTF8String.map { case (k, v) => (InternalRow.fromSeq(Seq(k, k, k)), v) } val testCasesUTF8String = Seq( @@ -1895,7 +1898,7 @@ class CollationSQLExpressionsSuite sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) val query = 
s"SELECT lower(element_at(element_at(mode(i), 1).s1.a2, 1)) FROM ${tableName}" - checkAnswer(sql(query), Row(t.result)) + checkAnswer(sql(query), Row(t.result)) } }) } @@ -1934,6 +1937,7 @@ class CollationSQLExpressionsSuite }.mkString(",") valuesToAdd } + val tableName = s"t_${t1.collationId}_mode_nested_map_struct1" withTable(tableName) { sql(s"CREATE TABLE ${tableName}(" + @@ -2005,9 +2009,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_unix_timestamp(collate('2021-01-01 00:00:00', '${collationName}'), - |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select to_unix_timestamp(collate('2021-01-01 00:00:00', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = LongType @@ -2022,8 +2026,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select from_unixtime(1609488000, collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select from_unixtime(1609488000, collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val testQuery = sql(query) @@ -2040,8 +2044,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select next_day('2015-01-14', collate('TU', '${collationName}')) - |""".stripMargin + |select next_day('2015-01-14', collate('TU', '${collationName}')) + |""".stripMargin // Result & data type check withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val testQuery = sql(query) @@ -2058,9 +2062,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select from_utc_timestamp(collate('2016-08-31', '${collationName}'), - |collate('Asia/Seoul', '${collationName}')) - 
|""".stripMargin + |select from_utc_timestamp(collate('2016-08-31', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2075,9 +2079,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_utc_timestamp(collate('2016-08-31 09:00:00', '${collationName}'), - |collate('Asia/Seoul', '${collationName}')) - |""".stripMargin + |select to_utc_timestamp(collate('2016-08-31 09:00:00', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2092,9 +2096,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_date(collate('2016-12-31', '${collationName}'), - |collate('yyyy-MM-dd', '${collationName}')) - |""".stripMargin + |select to_date(collate('2016-12-31', '${collationName}'), + |collate('yyyy-MM-dd', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = DateType @@ -2109,9 +2113,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_timestamp(collate('2016-12-31 23:59:59', '${collationName}'), - |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select to_timestamp(collate('2016-12-31 23:59:59', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2126,8 +2130,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select trunc(collate('2016-12-31 23:59:59', '${collationName}'), 'MM') - |""".stripMargin + |select trunc(collate('2016-12-31 23:59:59', '${collationName}'), 'MM') + |""".stripMargin // Result 
& data type check val testQuery = sql(query) val dataType = DateType @@ -2142,9 +2146,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_trunc(collate('HOUR', '${collationName}'), - |collate('2015-03-05T09:32:05.359', '${collationName}')) - |""".stripMargin + |select date_trunc(collate('HOUR', '${collationName}'), + |collate('2015-03-05T09:32:05.359', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2159,8 +2163,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select make_timestamp(2014, 12, 28, 6, 30, 45.887, collate('CET', '${collationName}')) - |""".stripMargin + |select make_timestamp(2014, 12, 28, 6, 30, 45.887, collate('CET', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2228,9 +2232,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_part(collate('Week', '${collationName}'), - |collate('2019-08-12 01:00:00.123456', '${collationName}')) - |""".stripMargin + |select date_part(collate('Week', '${collationName}'), + |collate('2019-08-12 01:00:00.123456', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = IntegerType @@ -2323,10 +2327,10 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_format(convert_timezone(collate('America/Los_Angeles', '${collationName}'), - |collate('UTC', '${collationName}'), collate('2021-12-06 00:00:00', '${collationName}')), - |'yyyy-MM-dd HH:mm:ss.S') - |""".stripMargin + |select date_format(convert_timezone(collate('America/Los_Angeles', '${collationName}'), + |collate('UTC', '${collationName}'), collate('2021-12-06 00:00:00', '${collationName}')), + |'yyyy-MM-dd 
HH:mm:ss.S') + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = StringType @@ -2340,12 +2344,12 @@ class CollationSQLExpressionsSuite // be aware that output of java.util.UUID.fromString is always lowercase case class ReflectExpressions( - left: String, - leftCollation: String, - right: String, - rightCollation: String, - result: Boolean - ) + left: String, + leftCollation: String, + right: String, + rightCollation: String, + result: Boolean + ) val testCases = Seq( ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", @@ -3057,9 +3061,9 @@ class CollationSQLExpressionsSuite ) // check result row data type val dataType = ArrayType(StructType( - StructField("key", StringType(collation), false) :: + StructField("key", StringType(collation), false) :: StructField("value", IntegerType, false) :: Nil - ), false) + ), false) assert(sql(query).schema.head.dataType == dataType) } } @@ -3216,10 +3220,10 @@ class CollationSQLExpressionsSuite test("Support HyperLogLogPlusPlus expression with collation") { case class HyperLogLogPlusPlusTestCase( - collation: String, - input: Seq[String], - output: Seq[Row] - ) + collation: String, + input: Seq[String], + output: Seq[Row] + ) val testCases = Seq( HyperLogLogPlusPlusTestCase("utf8_binary", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", @@ -3232,12 +3236,12 @@ class CollationSQLExpressionsSuite "aA", "Aa", "aa"), Seq(Row(5))) ) - testCases.foreach( t => { + testCases.foreach(t => { // Using explicit collate clause val query = s""" |SELECT approx_count_distinct(col) FROM VALUES - |${t.input.map(s => s"('${s}' collate ${t.collation})").mkString(", ") } tab(col) + |${t.input.map(s => s"('${s}' collate ${t.collation})").mkString(", ")} tab(col) |""".stripMargin checkAnswer(sql(query), t.output) @@ -3246,7 +3250,7 @@ class CollationSQLExpressionsSuite val query = s""" |SELECT approx_count_distinct(col) FROM VALUES - |${t.input.map(s => s"('${s}')").mkString(", ") } tab(col) + 
|${t.input.map(s => s"('${s}')").mkString(", ")} tab(col) |""".stripMargin checkAnswer(sql(query), t.output) } From 535b16bc58f155d5e77c6732a739f7dda66998de Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 13 Sep 2024 10:40:53 -0400 Subject: [PATCH 04/27] Revert "formatting" This reverts commit d6974039d717065aa94500e6a4e94964ecdad266. --- .../sql/CollationSQLExpressionsSuite.scala | 482 +++++++++--------- 1 file changed, 239 insertions(+), 243 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index a2b306bc10568..cffcb43bfbe2b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.util.collection.OpenHashMap // scalastyle:off nonascii class CollationSQLExpressionsSuite - extends QueryTest + extends QueryTest with SharedSparkSession with ExpressionEvalHelper { @@ -43,10 +43,10 @@ class CollationSQLExpressionsSuite test("Support Md5 hash expression with collation") { case class Md5TestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( Md5TestCase("Spark", "UTF8_BINARY", "8cde774d6f7333752ed72cacddb05126"), @@ -73,11 +73,11 @@ class CollationSQLExpressionsSuite test("Support Sha2 hash expression with collation") { case class Sha2TestCase( - input: String, - collationName: String, - bitLength: Int, - result: String - ) + input: String, + collationName: String, + bitLength: Int, + result: String + ) val testCases = Seq( Sha2TestCase("Spark", "UTF8_BINARY", 256, @@ -108,10 +108,10 @@ class CollationSQLExpressionsSuite test("Support Sha1 hash expression with collation") { case class Sha1TestCase( - input: String, - collationName: String, - result: String - ) + input: 
String, + collationName: String, + result: String + ) val testCases = Seq( Sha1TestCase("Spark", "UTF8_BINARY", "85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c"), @@ -138,10 +138,10 @@ class CollationSQLExpressionsSuite test("Support Crc32 hash expression with collation") { case class Crc321TestCase( - input: String, - collationName: String, - result: Int - ) + input: String, + collationName: String, + result: Int + ) val testCases = Seq( Crc321TestCase("Spark", "UTF8_BINARY", 1557323817), @@ -166,10 +166,10 @@ class CollationSQLExpressionsSuite test("Support Murmur3Hash hash expression with collation") { case class Murmur3HashTestCase( - input: String, - collationName: String, - result: Int - ) + input: String, + collationName: String, + result: Int + ) val testCases = Seq( Murmur3HashTestCase("Spark", "UTF8_BINARY", 228093765), @@ -194,10 +194,10 @@ class CollationSQLExpressionsSuite test("Support XxHash64 hash expression with collation") { case class XxHash64TestCase( - input: String, - collationName: String, - result: Long - ) + input: String, + collationName: String, + result: Long + ) val testCases = Seq( XxHash64TestCase("Spark", "UTF8_BINARY", -4294468057691064905L), @@ -222,10 +222,10 @@ class CollationSQLExpressionsSuite test("Support UrlEncode hash expression with collation") { case class UrlEncodeTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( UrlEncodeTestCase("https://spark.apache.org", "UTF8_BINARY", @@ -256,10 +256,10 @@ class CollationSQLExpressionsSuite test("Support UrlDecode hash expression with collation") { case class UrlDecodeTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_BINARY", @@ -290,11 +290,11 @@ class CollationSQLExpressionsSuite test("Support ParseUrl hash expression 
with collation") { case class ParseUrlTestCase( - input: String, - collationName: String, - path: String, - result: String - ) + input: String, + collationName: String, + path: String, + result: String + ) val testCases = Seq( ParseUrlTestCase("http://spark.apache.org/path?query=1", "UTF8_BINARY", "HOST", @@ -325,13 +325,13 @@ class CollationSQLExpressionsSuite test("Support CsvToStructs csv expression with collation") { case class CsvToStructsTestCase( - input: String, - collationName: String, - schema: String, - options: String, - result: Row, - structFields: Seq[StructField] - ) + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) val testCases = Seq( CsvToStructsTestCase("1", "UTF8_BINARY", "'a INT'", "", @@ -374,10 +374,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfCsv csv expression with collation") { case class SchemaOfCsvTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfCsvTestCase("1", "UTF8_BINARY", "STRUCT<_c0: INT>"), @@ -406,10 +406,10 @@ class CollationSQLExpressionsSuite test("Support StructsToCsv csv expression with collation") { case class StructsToCsvTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( StructsToCsvTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", "1,2"), @@ -438,11 +438,11 @@ class CollationSQLExpressionsSuite test("Conv expression with collation") { // Supported collations case class ConvTestCase( - num: String, - from_base: String, - to_base: String, - collationName: String, - result: String) + num: String, + from_base: String, + to_base: String, + collationName: String, + result: String) val testCases = Seq( ConvTestCase("100", "2", "10", "UTF8_BINARY", "4"), @@ -464,9 +464,9 @@ class CollationSQLExpressionsSuite 
test("Bin expression with collation") { // Supported collations case class BinTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( BinTestCase("13", "UTF8_BINARY", "1101"), @@ -489,9 +489,9 @@ class CollationSQLExpressionsSuite test("Hex with non-string input expression with collation") { case class HexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( HexTestCase("13", "UTF8_BINARY", "D"), @@ -514,9 +514,9 @@ class CollationSQLExpressionsSuite test("Hex with string input expression with collation") { case class HexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( HexTestCase("Spark SQL", "UTF8_BINARY", "537061726B2053514C"), @@ -537,9 +537,9 @@ class CollationSQLExpressionsSuite test("UnHex expression with collation") { case class UnHexTestCase( - num: String, - collationName: String, - result: String) + num: String, + collationName: String, + result: String) val testCases = Seq( UnHexTestCase("537061726B2053514C", "UTF8_BINARY", "Spark SQL"), @@ -560,13 +560,13 @@ class CollationSQLExpressionsSuite test("Support XPath expressions with collation") { case class XPathTestCase( - xml: String, - xpath: String, - functionName: String, - collationName: String, - result: Any, - resultType: DataType - ) + xml: String, + xpath: String, + functionName: String, + collationName: String, + result: Any, + resultType: DataType + ) val testCases = Seq( XPathTestCase("1", "a/b", @@ -604,10 +604,10 @@ class CollationSQLExpressionsSuite test("Support StringSpace expression with collation") { case class StringSpaceTestCase( - input: Int, - collationName: String, - result: String - ) + input: Int, + collationName: String, + result: String + ) val testCases = Seq( StringSpaceTestCase(1, "UTF8_BINARY", " 
"), @@ -634,12 +634,12 @@ class CollationSQLExpressionsSuite test("Support ToNumber & TryToNumber expressions with collation") { case class ToNumberTestCase( - input: String, - collationName: String, - format: String, - result: Any, - resultType: DataType - ) + input: String, + collationName: String, + format: String, + result: Any, + resultType: DataType + ) val testCases = Seq( ToNumberTestCase("123", "UTF8_BINARY", "999", 123, DecimalType(3, 0)), @@ -705,11 +705,11 @@ class CollationSQLExpressionsSuite test("Support ToChar expression with collation") { case class ToCharTestCase( - input: Int, - collationName: String, - format: String, - result: String - ) + input: Int, + collationName: String, + format: String, + result: String + ) val testCases = Seq( ToCharTestCase(12, "UTF8_BINARY", "999", " 12"), @@ -736,11 +736,11 @@ class CollationSQLExpressionsSuite test("Support GetJsonObject json expression with collation") { case class GetJsonObjectTestCase( - input: String, - path: String, - collationName: String, - result: String - ) + input: String, + path: String, + collationName: String, + result: String + ) val testCases = Seq( GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY", "b"), @@ -767,11 +767,11 @@ class CollationSQLExpressionsSuite test("Support JsonTuple json expression with collation") { case class JsonTupleTestCase( - input: String, - names: String, - collationName: String, - result: Row - ) + input: String, + names: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY", @@ -802,11 +802,11 @@ class CollationSQLExpressionsSuite test("Support JsonToStructs json expression with collation") { case class JsonToStructsTestCase( - input: String, - schema: String, - collationName: String, - result: Row - ) + input: String, + schema: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", 
@@ -837,10 +837,10 @@ class CollationSQLExpressionsSuite test("Support StructsToJson json expression with collation") { case class StructsToJsonTestCase( - struct: String, - collationName: String, - result: Row - ) + struct: String, + collationName: String, + result: Row + ) val testCases = Seq( StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", @@ -871,10 +871,10 @@ class CollationSQLExpressionsSuite test("Support LengthOfJsonArray json expression with collation") { case class LengthOfJsonArrayTestCase( - input: String, - collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY", Row(4)), @@ -900,10 +900,10 @@ class CollationSQLExpressionsSuite test("Support JsonObjectKeys json expression with collation") { case class JsonObjectKeysJsonArrayTestCase( - input: String, - collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY", @@ -934,10 +934,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfJson json expression with collation") { case class SchemaOfJsonTestCase( - input: String, - collationName: String, - result: Row - ) + input: String, + collationName: String, + result: Row + ) val testCases = Seq( SchemaOfJsonTestCase("'[{\"col\":0}]'", @@ -968,11 +968,11 @@ class CollationSQLExpressionsSuite test("Support `StringToMap` expression with collation") { case class StringToMapTestCase[R]( - text: String, - pairDelim: String, - keyValueDelim: String, - collation: String, - result: R) + text: String, + pairDelim: String, + keyValueDelim: String, + collation: String, + result: R) val testCases = Seq( StringToMapTestCase("a:1,b:2,c:3", ",", ":", "UTF8_BINARY", Map("a" -> "1", "b" -> "2", "c" -> "3")), @@ -1095,11 +1095,11 @@ class CollationSQLExpressionsSuite test("Support AesEncrypt misc expression with collation") { // 
Supported collations case class AesEncryptTestCase( - input: String, - collationName: String, - params: String, - result: String - ) + input: String, + collationName: String, + params: String, + result: String + ) val testCases = Seq( AesEncryptTestCase("Spark", "UTF8_BINARY", "'1234567890abcdef', 'ECB'", "8DE7DB79A23F3E8ED530994DDEA98913"), @@ -1127,11 +1127,11 @@ class CollationSQLExpressionsSuite test("Support AesDecrypt misc expression with collation") { // Supported collations case class AesDecryptTestCase( - input: String, - collationName: String, - params: String, - result: String - ) + input: String, + collationName: String, + params: String, + result: String + ) val testCases = Seq( AesDecryptTestCase("8DE7DB79A23F3E8ED530994DDEA98913", "UTF8_BINARY", "'1234567890abcdef', 'ECB'", "Spark"), @@ -1156,7 +1156,7 @@ class CollationSQLExpressionsSuite test("Support Mask expression with collation") { // Supported collations case class MaskTestCase[R](i: String, u: String, l: String, d: String, o: String, c: String, - result: R) + result: R) val testCases = Seq( MaskTestCase("ab-CD-12-@$", null, null, null, null, "UTF8_BINARY", "ab-CD-12-@$"), MaskTestCase("ab-CD-12-@$", "X", null, null, null, "UTF8_LCASE", "ab-XX-12-@$"), @@ -1165,7 +1165,6 @@ class CollationSQLExpressionsSuite ) testCases.foreach(t => { def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" - val query = s"SELECT mask(${col(t.i)}, ${col(t.u)}, ${col(t.l)}, ${col(t.d)}, ${col(t.o)})" // Result & data type var result = sql(query) @@ -1178,9 +1177,7 @@ class CollationSQLExpressionsSuite ) testCasting.foreach(t => { def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" - def str(s: String): String = if (s == null) "null" else s"'$s'" - val query1 = s"SELECT mask(${col(t.i)}, ${str(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" val query2 = s"SELECT mask(${str(t.i)}, ${col(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" val query3 = s"SELECT 
mask(${str(t.i)}, ${str(t.u)}, ${col(t.l)}, ${str(t.d)}, ${str(t.o)})" @@ -1204,13 +1201,13 @@ class CollationSQLExpressionsSuite test("Support XmlToStructs xml expression with collation") { case class XmlToStructsTestCase( - input: String, - collationName: String, - schema: String, - options: String, - result: Row, - structFields: Seq[StructField] - ) + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) val testCases = Seq( XmlToStructsTestCase("

1

", "UTF8_BINARY", "'a INT'", "", @@ -1252,10 +1249,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfXml xml expression with collation") { case class SchemaOfXmlTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfXmlTestCase("

1

", "UTF8_BINARY", "STRUCT"), @@ -1284,10 +1281,10 @@ class CollationSQLExpressionsSuite test("Support StructsToXml xml expression with collation") { case class StructsToXmlTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( StructsToXmlTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", @@ -1336,10 +1333,10 @@ class CollationSQLExpressionsSuite test("Support ParseJson & TryParseJson variant expressions with collation") { case class ParseJsonTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( ParseJsonTestCase("{\"a\":1,\"b\":2}", "UTF8_BINARY", "{\"a\":1,\"b\":2}"), @@ -1408,10 +1405,10 @@ class CollationSQLExpressionsSuite test("Support IsVariantNull variant expressions with collation") { case class IsVariantNullTestCase( - input: String, - collationName: String, - result: Boolean - ) + input: String, + collationName: String, + result: Boolean + ) val testCases = Seq( IsVariantNullTestCase("'null'", "UTF8_BINARY", result = true), @@ -1436,13 +1433,13 @@ class CollationSQLExpressionsSuite test("Support VariantGet & TryVariantGet variant expressions with collation") { case class VariantGetTestCase( - input: String, - path: String, - variantType: String, - collationName: String, - result: Any, - resultType: DataType - ) + input: String, + path: String, + variantType: String, + collationName: String, + result: Any, + resultType: DataType + ) val testCases = Seq( VariantGetTestCase("{\"a\": 1}", "$.a", "int", "UTF8_BINARY", 1, IntegerType), @@ -1512,20 +1509,20 @@ class CollationSQLExpressionsSuite test("Support VariantExplode variant expressions with collation") { case class VariantExplodeTestCase( - input: String, - collationName: String, - result: String, - resultType: Seq[StructField] - ) + input: String, + collationName: String, + result: String, + 
resultType: Seq[StructField] + ) val testCases = Seq( VariantExplodeTestCase("[\"hello\", \"world\"]", "UTF8_BINARY", - Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), - Seq[StructField]( - StructField("pos", IntegerType, nullable = false), - StructField("key", StringType("UTF8_BINARY")), - StructField("value", VariantType, nullable = false) - ) + Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_BINARY")), + StructField("value", VariantType, nullable = false) + ) ), VariantExplodeTestCase("[\"Spark\", \"SQL\"]", "UTF8_LCASE", Row(0, "null", "\"Spark\"").toString() + Row(1, "null", "\"SQL\"").toString(), @@ -1571,10 +1568,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfVariant variant expressions with collation") { case class SchemaOfVariantTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfVariantTestCase("null", "UTF8_BINARY", "VOID"), @@ -1602,10 +1599,10 @@ class CollationSQLExpressionsSuite test("Support SchemaOfVariantAgg variant expressions with collation") { case class SchemaOfVariantAggTestCase( - input: String, - collationName: String, - result: String - ) + input: String, + collationName: String, + result: String + ) val testCases = Seq( SchemaOfVariantAggTestCase("('1'), ('2'), ('3')", "UTF8_BINARY", "BIGINT"), @@ -1753,7 +1750,7 @@ class CollationSQLExpressionsSuite UTF8String.fromString("d") -> 2L, UTF8String.fromString("e") -> 1L) - val bufferValuesComplex = bufferValuesUTF8String.map { + val bufferValuesComplex = bufferValuesUTF8String.map{ case (k, v) => (InternalRow.fromSeq(Seq(k, k, k)), v) } val testCasesUTF8String = Seq( @@ -1898,7 +1895,7 @@ class CollationSQLExpressionsSuite sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) val query = 
s"SELECT lower(element_at(element_at(mode(i), 1).s1.a2, 1)) FROM ${tableName}" - checkAnswer(sql(query), Row(t.result)) + checkAnswer(sql(query), Row(t.result)) } }) } @@ -1937,7 +1934,6 @@ class CollationSQLExpressionsSuite }.mkString(",") valuesToAdd } - val tableName = s"t_${t1.collationId}_mode_nested_map_struct1" withTable(tableName) { sql(s"CREATE TABLE ${tableName}(" + @@ -2009,9 +2005,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_unix_timestamp(collate('2021-01-01 00:00:00', '${collationName}'), - |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select to_unix_timestamp(collate('2021-01-01 00:00:00', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = LongType @@ -2026,8 +2022,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select from_unixtime(1609488000, collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select from_unixtime(1609488000, collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val testQuery = sql(query) @@ -2044,8 +2040,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select next_day('2015-01-14', collate('TU', '${collationName}')) - |""".stripMargin + |select next_day('2015-01-14', collate('TU', '${collationName}')) + |""".stripMargin // Result & data type check withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { val testQuery = sql(query) @@ -2062,9 +2058,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select from_utc_timestamp(collate('2016-08-31', '${collationName}'), - |collate('Asia/Seoul', '${collationName}')) - 
|""".stripMargin + |select from_utc_timestamp(collate('2016-08-31', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2079,9 +2075,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_utc_timestamp(collate('2016-08-31 09:00:00', '${collationName}'), - |collate('Asia/Seoul', '${collationName}')) - |""".stripMargin + |select to_utc_timestamp(collate('2016-08-31 09:00:00', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2096,9 +2092,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_date(collate('2016-12-31', '${collationName}'), - |collate('yyyy-MM-dd', '${collationName}')) - |""".stripMargin + |select to_date(collate('2016-12-31', '${collationName}'), + |collate('yyyy-MM-dd', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = DateType @@ -2113,9 +2109,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select to_timestamp(collate('2016-12-31 23:59:59', '${collationName}'), - |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) - |""".stripMargin + |select to_timestamp(collate('2016-12-31 23:59:59', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2130,8 +2126,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select trunc(collate('2016-12-31 23:59:59', '${collationName}'), 'MM') - |""".stripMargin + |select trunc(collate('2016-12-31 23:59:59', '${collationName}'), 'MM') + |""".stripMargin // Result 
& data type check val testQuery = sql(query) val dataType = DateType @@ -2146,9 +2142,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_trunc(collate('HOUR', '${collationName}'), - |collate('2015-03-05T09:32:05.359', '${collationName}')) - |""".stripMargin + |select date_trunc(collate('HOUR', '${collationName}'), + |collate('2015-03-05T09:32:05.359', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2163,8 +2159,8 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select make_timestamp(2014, 12, 28, 6, 30, 45.887, collate('CET', '${collationName}')) - |""".stripMargin + |select make_timestamp(2014, 12, 28, 6, 30, 45.887, collate('CET', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = TimestampType @@ -2232,9 +2228,9 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_part(collate('Week', '${collationName}'), - |collate('2019-08-12 01:00:00.123456', '${collationName}')) - |""".stripMargin + |select date_part(collate('Week', '${collationName}'), + |collate('2019-08-12 01:00:00.123456', '${collationName}')) + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = IntegerType @@ -2327,10 +2323,10 @@ class CollationSQLExpressionsSuite testSuppCollations.foreach(collationName => { val query = s""" - |select date_format(convert_timezone(collate('America/Los_Angeles', '${collationName}'), - |collate('UTC', '${collationName}'), collate('2021-12-06 00:00:00', '${collationName}')), - |'yyyy-MM-dd HH:mm:ss.S') - |""".stripMargin + |select date_format(convert_timezone(collate('America/Los_Angeles', '${collationName}'), + |collate('UTC', '${collationName}'), collate('2021-12-06 00:00:00', '${collationName}')), + |'yyyy-MM-dd 
HH:mm:ss.S') + |""".stripMargin // Result & data type check val testQuery = sql(query) val dataType = StringType @@ -2344,12 +2340,12 @@ class CollationSQLExpressionsSuite // be aware that output of java.util.UUID.fromString is always lowercase case class ReflectExpressions( - left: String, - leftCollation: String, - right: String, - rightCollation: String, - result: Boolean - ) + left: String, + leftCollation: String, + right: String, + rightCollation: String, + result: Boolean + ) val testCases = Seq( ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", @@ -3061,9 +3057,9 @@ class CollationSQLExpressionsSuite ) // check result row data type val dataType = ArrayType(StructType( - StructField("key", StringType(collation), false) :: + StructField("key", StringType(collation), false) :: StructField("value", IntegerType, false) :: Nil - ), false) + ), false) assert(sql(query).schema.head.dataType == dataType) } } @@ -3220,10 +3216,10 @@ class CollationSQLExpressionsSuite test("Support HyperLogLogPlusPlus expression with collation") { case class HyperLogLogPlusPlusTestCase( - collation: String, - input: Seq[String], - output: Seq[Row] - ) + collation: String, + input: Seq[String], + output: Seq[Row] + ) val testCases = Seq( HyperLogLogPlusPlusTestCase("utf8_binary", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", @@ -3236,12 +3232,12 @@ class CollationSQLExpressionsSuite "aA", "Aa", "aa"), Seq(Row(5))) ) - testCases.foreach(t => { + testCases.foreach( t => { // Using explicit collate clause val query = s""" |SELECT approx_count_distinct(col) FROM VALUES - |${t.input.map(s => s"('${s}' collate ${t.collation})").mkString(", ")} tab(col) + |${t.input.map(s => s"('${s}' collate ${t.collation})").mkString(", ") } tab(col) |""".stripMargin checkAnswer(sql(query), t.output) @@ -3250,7 +3246,7 @@ class CollationSQLExpressionsSuite val query = s""" |SELECT approx_count_distinct(col) FROM VALUES - |${t.input.map(s => s"('${s}')").mkString(", ")} tab(col) + 
|${t.input.map(s => s"('${s}')").mkString(", ") } tab(col) |""".stripMargin checkAnswer(sql(query), t.output) } From 066ebd4957bb171896df256f76c1a3bbcfdeaf8b Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 13 Sep 2024 10:41:25 -0400 Subject: [PATCH 05/27] formatting --- .../apache/spark/sql/catalyst/expressions/aggregate/Mode.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 3c7caad6dcac6..61c08d4c90dc1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -30,7 +30,6 @@ import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, ArrayType, Boo import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.collection.OpenHashMap - case class Mode( child: Expression, mutableAggBufferOffset: Int = 0, From f2d0503d8e38bed1db5914911b48f1056c9ad389 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 13 Sep 2024 10:43:14 -0400 Subject: [PATCH 06/27] formatting --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index cffcb43bfbe2b..f9ccb55a58812 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1739,9 +1739,9 @@ class CollationSQLExpressionsSuite test("Support Mode.eval(buffer) with complex types") { case class UTF8StringModeTestCase[R]( - collationId: String, - bufferValues: Map[InternalRow, Long], - result: R) + collationId: String, + 
bufferValues: Map[InternalRow, Long], + result: R) val bufferValuesUTF8String: Map[Any, Long] = Map( UTF8String.fromString("a") -> 5L, From 4f0cfbe1f23a3c83762d796cdb5f14b597dc1916 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 13 Sep 2024 10:44:20 -0400 Subject: [PATCH 07/27] formatting --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index f9ccb55a58812..abd5e75ab1712 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1712,9 +1712,9 @@ class CollationSQLExpressionsSuite test("Support Mode.eval(buffer)") { case class UTF8StringModeTestCase[R]( - collationId: String, - bufferValues: Map[UTF8String, Long], - result: R) + collationId: String, + bufferValues: Map[UTF8String, Long], + result: R) val bufferValuesUTF8String = Map( UTF8String.fromString("a") -> 5L, From 432de23290c89e3b2cfd4595de5a2c30e07edc24 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Tue, 17 Sep 2024 10:16:36 -0400 Subject: [PATCH 08/27] move reason --- common/utils/src/main/resources/error/error-conditions.json | 2 +- .../apache/spark/sql/catalyst/expressions/aggregate/Mode.scala | 3 +-- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 2bc681453556c..f143907bda5ad 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -1007,7 +1007,7 @@ }, "UNSUPPORTED_MODE_DATA_TYPE" : { "message" : [ - "The does not support the data type, because 
." + "The does not support the data type, because there is MapType with collated fields." ] }, "UNSUPPORTED_UDF_INPUT_TYPE" : { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 61c08d4c90dc1..af92ed0c6765c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -65,8 +65,7 @@ case class Mode( TypeCheckResult.DataTypeMismatch("UNSUPPORTED_MODE_DATA_TYPE", messageParameters = Map("child" -> toSQLType(child.dataType), - "mode" -> toSQLId(prettyName), - "reason" -> "MapType with collated fields")) + "mode" -> toSQLId(prettyName))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index abd5e75ab1712..7414c7ee0f5c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1919,8 +1919,7 @@ class CollationSQLExpressionsSuite parameters = Map( ("sqlExpr", "\"mode(i)\""), ("child", c1), - ("mode", "`mode`"), - ("reason", "MapType with collated fields")), + ("mode", "`mode`")), queryContext = Seq(ExpectedContext("mode(i)", 18, 24)).toArray ) } From a8d626b98e8455cc116c566dc0844497fbc80e0e Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:26:50 -0400 Subject: [PATCH 09/27] four spaces for classes --- .../spark/sql/CollationSQLExpressionsSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 
7414c7ee0f5c8..9f08e4412dada 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1712,9 +1712,9 @@ class CollationSQLExpressionsSuite test("Support Mode.eval(buffer)") { case class UTF8StringModeTestCase[R]( - collationId: String, - bufferValues: Map[UTF8String, Long], - result: R) + collationId: String, + bufferValues: Map[UTF8String, Long], + result: R) val bufferValuesUTF8String = Map( UTF8String.fromString("a") -> 5L, @@ -1739,9 +1739,9 @@ class CollationSQLExpressionsSuite test("Support Mode.eval(buffer) with complex types") { case class UTF8StringModeTestCase[R]( - collationId: String, - bufferValues: Map[InternalRow, Long], - result: R) + collationId: String, + bufferValues: Map[InternalRow, Long], + result: R) val bufferValuesUTF8String: Map[Any, Long] = Map( UTF8String.fromString("a") -> 5L, From d621b8aec4d28b010346273d800bf49786c1c418 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:31:12 -0400 Subject: [PATCH 10/27] fix indentation of method params --- .../spark/sql/catalyst/expressions/aggregate/Mode.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index af92ed0c6765c..393e3168e4ff2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -92,8 +92,8 @@ case class Mode( } private def getCollationAwareBuffer( - childDataType: DataType, - buffer: OpenHashMap[AnyRef, Long]): Iterable[(AnyRef, Long)] = { + childDataType: DataType, + buffer: OpenHashMap[AnyRef, Long]): Iterable[(AnyRef, Long)] = { def groupAndReduceBuffer(groupingFunction: AnyRef => _): 
Iterable[(AnyRef, Long)] = { buffer.groupMapReduce(t => groupingFunction(t._1))(x => x)((x, y) => (x._1, x._2 + y._2)).values From af97fe8cb39fe37ef3ab0c93498fd2ae414fe44d Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:32:00 -0400 Subject: [PATCH 11/27] fix indentation of method params --- .../spark/sql/catalyst/expressions/aggregate/Mode.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 393e3168e4ff2..14d969af96b7b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -99,8 +99,8 @@ case class Mode( groupingFunction(t._1))(x => x)((x, y) => (x._1, x._2 + y._2)).values } def determineBufferingFunction( - childDataType: DataType): Option[AnyRef => _] = { - childDataType match { + childDataType: DataType): Option[AnyRef => _] = { + childDataType match {å case _ if UnsafeRowUtils.isBinaryStable(child.dataType) => None case _ => Some(collationAwareTransform(_, childDataType)) } From bf91fe912d25d27a55c56412aa78459e652950e7 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:36:20 -0400 Subject: [PATCH 12/27] fix indentation of method params --- .../apache/spark/sql/catalyst/expressions/aggregate/Mode.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 14d969af96b7b..a9648693d66f1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -100,7 +100,7 @@ case class Mode( } def determineBufferingFunction( childDataType: DataType): Option[AnyRef => _] = { - childDataType match {å + childDataType match { case _ if UnsafeRowUtils.isBinaryStable(child.dataType) => None case _ => Some(collationAwareTransform(_, childDataType)) } From 0b7364fabbac9520c93a7f07acd1302df3f0764c Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:40:03 -0400 Subject: [PATCH 13/27] fix indentation of method params --- .../apache/spark/sql/catalyst/expressions/aggregate/Mode.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index a9648693d66f1..09b0b4976379f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -123,7 +123,7 @@ case class Mode( } private def processStructTypeWithBuffer( - tuples: Seq[(Any, StructField)]): Seq[Any] = { + tuples: Seq[(Any, StructField)]): Seq[Any] = { tuples.map(t => collationAwareTransform(t._1.asInstanceOf[AnyRef], t._2.dataType)) } From ca564d3a9f48bf97cc5d627eae100cea53a9e9aa Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 09:40:35 -0400 Subject: [PATCH 14/27] fix indentation of method params --- .../spark/sql/catalyst/expressions/aggregate/Mode.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 09b0b4976379f..2c6af14fc18b3 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -128,8 +128,8 @@ case class Mode( } private def processArrayTypeWithBuffer( - a: ArrayType, - data: ArrayData): Seq[Any] = { + a: ArrayType, + data: ArrayData): Seq[Any] = { (0 until data.numElements()).map(i => collationAwareTransform(data.get(i, a.elementType), a.elementType)) } From 96c742f5a9ba67ddf24bf001cb8ab482a1c9689c Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 09:42:21 -0400 Subject: [PATCH 15/27] Update common/utils/src/main/resources/error/error-conditions.json --- common/utils/src/main/resources/error/error-conditions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index f143907bda5ad..a316190214923 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -1007,7 +1007,7 @@ }, "UNSUPPORTED_MODE_DATA_TYPE" : { "message" : [ - "The does not support the data type, because there is MapType with collated fields." + "The does not support the data type, because there is a \"MAP\" type with keys and/or values that have collated sub-fields." 
] }, "UNSUPPORTED_UDF_INPUT_TYPE" : { From d5552cd05434390280dd78bd8babdd0c0c538f97 Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:10:33 -0400 Subject: [PATCH 16/27] Update sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala Co-authored-by: Maxim Gekk --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 9f08e4412dada..2ea66105e0e9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1759,7 +1759,7 @@ class CollationSQLExpressionsSuite UTF8StringModeTestCase("unicode_ci", bufferValuesComplex, "[b,b,b]"), UTF8StringModeTestCase("unicode", bufferValuesComplex, "[a,a,a]")) - testCasesUTF8String.foreach(t => { + testCasesUTF8String.foreach { t => val buffer = new OpenHashMap[AnyRef, Long](5) val myMode = Mode(child = Literal.create(null, StructType(Seq( StructField("f1", StringType(t.collationId), true), From ce8986fd8bf5fb64b574ecb64dee10397285fa4f Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:10:42 -0400 Subject: [PATCH 17/27] Update sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala Co-authored-by: Maxim Gekk --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 2ea66105e0e9a..aab9cf24c29d9 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1907,7 +1907,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{a -> 1}"), ModeTestCase("utf8_lcase", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "{b -> 1}") - ).foreach(t1 => { + ).foreach { t1 => def checkThisError(t: ModeTestCase, query: String): Any = { val c = s"STRUCT>" val c1 = s"\"${c}\"" From 2632b91192706bf015b5b38841116ade55b2484f Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:10:51 -0400 Subject: [PATCH 18/27] Update sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala Co-authored-by: Maxim Gekk --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index aab9cf24c29d9..f42e839c5828a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1855,7 +1855,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach(t => { + testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"array(array(array(collate('$elt', '${t.collationId}'))))").mkString(",") From 72483ac07f464aecd504119c76070a989868553a Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 
12:11:17 -0400 Subject: [PATCH 19/27] Update sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala Co-authored-by: Maxim Gekk --- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index f42e839c5828a..56801b0a87eb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1829,7 +1829,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach(t => { + testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"array(named_struct('f2', " + s"collate('$elt', '${t.collationId}'), 'f3', 1))").mkString(",") From 46954629365aed6687bfc1d919e6047618c11d70 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Fri, 20 Sep 2024 15:07:03 -0400 Subject: [PATCH 20/27] fix call to throw SparkUnsupportedOperationException --- .../catalyst/expressions/aggregate/Mode.scala | 6 ++++- .../sql/CollationSQLExpressionsSuite.scala | 23 +++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 2c6af14fc18b3..3800bc4abffcf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -118,7 +118,11 @@ case class Mode( 
CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) case _ => throw new SparkUnsupportedOperationException( - s"Unsupported data type for collation-aware mode: $dataType") + "UNSUPPORTED_MODE_DATA_TYPE", + messageParameters = + Map("child" -> toSQLType(child.dataType), + "mode" -> toSQLId(prettyName)) + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 56801b0a87eb8..e704dae7f3917 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1729,12 +1729,12 @@ class CollationSQLExpressionsSuite UTF8StringModeTestCase("unicode_ci", bufferValuesUTF8String, "b"), UTF8StringModeTestCase("unicode", bufferValuesUTF8String, "a")) - testCasesUTF8String.foreach(t => { + testCasesUTF8String.foreach { t => val buffer = new OpenHashMap[AnyRef, Long](5) val myMode = Mode(child = Literal.create("some_column_name", StringType(t.collationId))) t.bufferValues.foreach { case (k, v) => buffer.update(k, v) } assert(myMode.eval(buffer).toString.toLowerCase() == t.result.toLowerCase()) - }) + } } test("Support Mode.eval(buffer) with complex types") { @@ -1768,7 +1768,7 @@ class CollationSQLExpressionsSuite )))) t.bufferValues.foreach { case (k, v) => buffer.update(k, v) } assert(myMode.eval(buffer).toString.toLowerCase() == t.result.toLowerCase()) - }) + } } test("Support mode for string expression with collated strings in struct") { @@ -1779,7 +1779,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach(t => { + testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"named_struct('f1'," + s" 
collate('$elt', '${t.collationId}'), 'f2', 1)").mkString(",") @@ -1793,7 +1793,7 @@ class CollationSQLExpressionsSuite val query = s"SELECT lower(mode(i).f1) FROM ${tableName}" checkAnswer(sql(query), Row(t.result)) } - }) + } } test("Support mode for string expression with collated strings in recursively nested struct") { @@ -1804,7 +1804,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach(t => { + testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"named_struct('f1', " + s"named_struct('f2', collate('$elt', '${t.collationId}')), 'f3', 1)").mkString(",") @@ -1818,7 +1818,7 @@ class CollationSQLExpressionsSuite val query = s"SELECT lower(mode(i).f1.f2) FROM ${tableName}" checkAnswer(sql(query), Row(t.result)) } - }) + } } test("Support mode for string expression with collated strings in array complex type") { @@ -1844,7 +1844,7 @@ class CollationSQLExpressionsSuite val query = s"SELECT lower(element_at(mode(i).f2, 1)) FROM ${tableName}" checkAnswer(sql(query), Row(t.result)) } - }) + } } test("Support mode for string expression with collated strings in 3D array type") { @@ -1870,7 +1870,7 @@ class CollationSQLExpressionsSuite s"element_at(element_at(element_at(mode(i),1),1),1)) FROM ${tableName}" checkAnswer(sql(query), Row(t.result)) } - }) + } } test("Support mode for string expression with collated complex type - Highly nested") { @@ -1881,7 +1881,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach(t => { + testCases.foreach { t => val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"array(named_struct('s1', named_struct('a2', " + s"array(collate('$elt', 
'${t.collationId}'))), 'f3', 1))").mkString(",") @@ -1897,7 +1897,7 @@ class CollationSQLExpressionsSuite checkAnswer(sql(query), Row(t.result)) } - }) + } } test("Support mode expression with collated in recursively nested struct with map with keys") { @@ -1947,7 +1947,6 @@ class CollationSQLExpressionsSuite } } } - ) } test("SPARK-48430: Map value extraction with collations") { From b285a6f0b717d39bd0de77082ce6d9a9ff276e32 Mon Sep 17 00:00:00 2001 From: Gideon Potok <31429832+GideonPotok@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:29:23 -0400 Subject: [PATCH 21/27] Apply suggestions from code review --- .../apache/spark/sql/CollationSQLExpressionsSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index e704dae7f3917..3b7e5a14c967e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1729,12 +1729,12 @@ class CollationSQLExpressionsSuite UTF8StringModeTestCase("unicode_ci", bufferValuesUTF8String, "b"), UTF8StringModeTestCase("unicode", bufferValuesUTF8String, "a")) - testCasesUTF8String.foreach { t => + testCasesUTF8String.foreach ( t => { val buffer = new OpenHashMap[AnyRef, Long](5) val myMode = Mode(child = Literal.create("some_column_name", StringType(t.collationId))) t.bufferValues.foreach { case (k, v) => buffer.update(k, v) } assert(myMode.eval(buffer).toString.toLowerCase() == t.result.toLowerCase()) - } + }) } test("Support Mode.eval(buffer) with complex types") { @@ -1779,7 +1779,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ) - testCases.foreach { t => + testCases.foreach(t => { val valuesToAdd 
= t.bufferValues.map { case (elt, numRepeats) => (0L to numRepeats).map(_ => s"named_struct('f1'," + s" collate('$elt', '${t.collationId}'), 'f2', 1)").mkString(",") @@ -1793,7 +1793,7 @@ class CollationSQLExpressionsSuite val query = s"SELECT lower(mode(i).f1) FROM ${tableName}" checkAnswer(sql(query), Row(t.result)) } - } + }) } test("Support mode for string expression with collated strings in recursively nested struct") { From e3306982442c35f5ae2f242d4e2d3a731827bb7d Mon Sep 17 00:00:00 2001 From: Gideon P Date: Tue, 24 Sep 2024 17:33:34 -0400 Subject: [PATCH 22/27] fix --- .../catalyst/expressions/aggregate/Mode.scala | 7 +-- .../sql/CollationSQLExpressionsSuite.scala | 43 ++++++++++++++++++- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 3800bc4abffcf..19a7f77699870 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -108,7 +108,7 @@ case class Mode( determineBufferingFunction(childDataType).map(groupAndReduceBuffer).getOrElse(buffer) } - private def collationAwareTransform(data: AnyRef, dataType: DataType): AnyRef = { + protected[sql] def collationAwareTransform(data: AnyRef, dataType: DataType): AnyRef = { dataType match { case _ if UnsafeRowUtils.isBinaryStable(dataType) => data case st: StructType => @@ -118,10 +118,7 @@ case class Mode( CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) case _ => throw new SparkUnsupportedOperationException( - "UNSUPPORTED_MODE_DATA_TYPE", - messageParameters = - Map("child" -> toSQLType(child.dataType), - "mode" -> toSQLId(prettyName)) + "DIVIDE_BY_ZERO" ) } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 3b7e5a14c967e..7363af48094e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -23,7 +23,7 @@ import java.util.Locale import scala.collection.immutable.Seq -import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException, SparkThrowable} +import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException, SparkThrowable, SparkUnsupportedOperationException} import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Mode @@ -1949,6 +1949,47 @@ class CollationSQLExpressionsSuite } } + test("UDT with collation - Mode (throw exception)") { + /* + * - UTF8_BINARY -> 0 + * - UTF8_LCASE -> 1 + * - UNICODE -> 0x20000000 + * - UNICODE_AI -> 0x20010000 + * - UNICODE_CI -> 0x20020000 + * - UNICODE_CI_AI -> 0x20030000 + * - af -> 0x20000001 + * - af_CI_AI -> 0x20030001 + */ + case class ModeTestCase(collationId: String, bufferValues: Map[String, Long], result: String) + Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("utf8_lcase", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ).foreach { t1 => + if (t1.collationId != "utf8_binary") { + checkError( + exception = intercept[SparkUnsupportedOperationException] { + Mode( + child = Literal.create(null, + MapType(StringType(t1.collationId), IntegerType)) + ).collationAwareTransform( + data = Map.empty[String, Any], + dataType = 
MapType(StringType(t1.collationId), IntegerType) + ) + }, + condition = "DATATYPE_MISMATCH", + parameters = Map( + ("sqlExpr", "\"mode(null)\""), + ("child", "\"MapType(StringType(UTF8_LCASE),IntegerType)\""), + ("mode", "`mode`")), + queryContext = Seq(ExpectedContext("mode(null)", 18, 24)).toArray + ) + } + } + + } + test("SPARK-48430: Map value extraction with collations") { for { collateKey <- Seq(true, false) From f4074be42b8f5d026bc9820b800f2a5d85319bc1 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Thu, 26 Sep 2024 12:26:29 -0400 Subject: [PATCH 23/27] hello --- .../catalyst/expressions/aggregate/Mode.scala | 5 ++- .../sql/CollationSQLExpressionsSuite.scala | 32 ++++++------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 19a7f77699870..80f3986b4b1eb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -118,7 +118,10 @@ case class Mode( CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) case _ => throw new SparkUnsupportedOperationException( - "DIVIDE_BY_ZERO" + errorClass = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT", + messageParameters = Map( + "functionName" -> toSQLType(prettyName), + "dataType" -> toSQLType(child.dataType)) ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 7363af48094e1..05fb2d9368b68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -23,7 +23,7 @@ import java.util.Locale import 
scala.collection.immutable.Seq -import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException, SparkThrowable, SparkUnsupportedOperationException} +import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException, SparkThrowable} import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Mode @@ -1950,16 +1950,6 @@ class CollationSQLExpressionsSuite } test("UDT with collation - Mode (throw exception)") { - /* - * - UTF8_BINARY -> 0 - * - UTF8_LCASE -> 1 - * - UNICODE -> 0x20000000 - * - UNICODE_AI -> 0x20010000 - * - UNICODE_CI -> 0x20020000 - * - UNICODE_CI_AI -> 0x20030000 - * - af -> 0x20000001 - * - af_CI_AI -> 0x20030001 - */ case class ModeTestCase(collationId: String, bufferValues: Map[String, Long], result: String) Seq( ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), @@ -1968,9 +1958,10 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ).foreach { t1 => if (t1.collationId != "utf8_binary") { - checkError( - exception = intercept[SparkUnsupportedOperationException] { - Mode( + checkError( // org.apache.spark.SparkException: [INTERNAL_ERROR] Cannot + // find sub error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT' SQLSTATE: XX000 + exception = intercept[SparkException] { + Mode( child = Literal.create(null, MapType(StringType(t1.collationId), IntegerType)) ).collationAwareTransform( @@ -1978,16 +1969,13 @@ class CollationSQLExpressionsSuite dataType = MapType(StringType(t1.collationId), IntegerType) ) }, - condition = "DATATYPE_MISMATCH", - parameters = Map( - ("sqlExpr", "\"mode(null)\""), - ("child", "\"MapType(StringType(UTF8_LCASE),IntegerType)\""), - ("mode", "`mode`")), - queryContext = Seq(ExpectedContext("mode(null)", 18, 24)).toArray - ) + condition = 
"COMPLEX_EXPRESSION_UNSUPPORTED_INPUT", + parameters = Map("message" -> "Cannot find sub" + + " error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT'") + // Map("function" -> "mode(i)", "dataType" -> "MAP") + ) } } - } test("SPARK-48430: Map value extraction with collations") { From adae8f385aed7da2b722988ed163c7801b45e737 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Sat, 28 Sep 2024 13:39:05 -0400 Subject: [PATCH 24/27] passing tests --- .../apache/spark/sql/CollationSQLExpressionsSuite.scala | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 05fb2d9368b68..23c35f71c401b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1952,12 +1952,10 @@ class CollationSQLExpressionsSuite test("UDT with collation - Mode (throw exception)") { case class ModeTestCase(collationId: String, bufferValues: Map[String, Long], result: String) Seq( - ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("utf8_lcase", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ).foreach { t1 => - if (t1.collationId != "utf8_binary") { checkError( // org.apache.spark.SparkException: [INTERNAL_ERROR] Cannot // find sub error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT' SQLSTATE: XX000 exception = intercept[SparkException] { @@ -1969,13 +1967,12 @@ class CollationSQLExpressionsSuite dataType = MapType(StringType(t1.collationId), IntegerType) ) }, - condition = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT", - parameters = Map("message" -> "Cannot find sub" + - " error class 
'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT'") + condition = "INTERNAL_ERROR", + parameters = Map("message" -> + "Cannot find sub error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT'") // Map("function" -> "mode(i)", "dataType" -> "MAP") ) } - } } test("SPARK-48430: Map value extraction with collations") { From 37efd0cf25e61db5ef4256845173766c3fa8b25a Mon Sep 17 00:00:00 2001 From: Gideon P Date: Sat, 28 Sep 2024 13:44:30 -0400 Subject: [PATCH 25/27] passing tests --- .../spark/sql/catalyst/expressions/aggregate/Mode.scala | 4 ++-- .../org/apache/spark/sql/CollationSQLExpressionsSuite.scala | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index 80f3986b4b1eb..a69de9e2e70e6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate -import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, TypeCheckResult, UnresolvedWithinGroup} import org.apache.spark.sql.catalyst.expressions.{Ascending, Descending, Expression, ExpressionDescription, ImplicitCastInputTypes, SortOrder} @@ -117,7 +117,7 @@ case class Mode( case st: StringType => CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) case _ => - throw new SparkUnsupportedOperationException( + throw new SparkIllegalArgumentException( errorClass = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT", messageParameters = Map( "functionName" -> toSQLType(prettyName), diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 23c35f71c401b..2fbe920a30e71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1956,8 +1956,7 @@ class CollationSQLExpressionsSuite ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ).foreach { t1 => - checkError( // org.apache.spark.SparkException: [INTERNAL_ERROR] Cannot - // find sub error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT' SQLSTATE: XX000 + checkError( exception = intercept[SparkException] { Mode( child = Literal.create(null, From afd123bc90f788a9632703040fc71f5dfc59d23b Mon Sep 17 00:00:00 2001 From: Gideon P Date: Sun, 29 Sep 2024 13:26:57 -0400 Subject: [PATCH 26/27] Added COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.BAD_INPUTS. Tests pass. --- .../src/main/resources/error/error-conditions.json | 5 +++++ .../sql/catalyst/expressions/aggregate/Mode.scala | 4 +++- .../spark/sql/CollationSQLExpressionsSuite.scala | 11 ++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index a316190214923..b591d59a3f938 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -634,6 +634,11 @@ "message" : [ "The collection of input data types must not be empty." ] + }, + "BAD_INPUTS" : { + "message" : [ + "The input data types to <functionName> must be valid, but found the input types <dataType>."
+ ] } }, "sqlState" : "42K09" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index a69de9e2e70e6..8998348f0571b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, TypeCheckResult, UnresolvedWithinGroup} import org.apache.spark.sql.catalyst.expressions.{Ascending, Descending, Expression, ExpressionDescription, ImplicitCastInputTypes, SortOrder} +import org.apache.spark.sql.catalyst.expressions.Cast.toSQLExpr import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, GenericArrayData, UnsafeRowUtils} @@ -118,8 +119,9 @@ case class Mode( CollationFactory.getCollationKey(data.asInstanceOf[UTF8String], st.collationId) case _ => throw new SparkIllegalArgumentException( - errorClass = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT", + errorClass = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.BAD_INPUTS", messageParameters = Map( + "expression" -> toSQLExpr(this), "functionName" -> toSQLType(prettyName), "dataType" -> toSQLType(child.dataType)) ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 2fbe920a30e71..ecce2da6b6211 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -1957,7 +1957,7 @@ class 
CollationSQLExpressionsSuite ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") ).foreach { t1 => checkError( - exception = intercept[SparkException] { + exception = intercept[SparkIllegalArgumentException] { Mode( child = Literal.create(null, MapType(StringType(t1.collationId), IntegerType)) @@ -1966,10 +1966,11 @@ class CollationSQLExpressionsSuite dataType = MapType(StringType(t1.collationId), IntegerType) ) }, - condition = "INTERNAL_ERROR", - parameters = Map("message" -> - "Cannot find sub error class 'COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.NO_INPUT'") - // Map("function" -> "mode(i)", "dataType" -> "MAP") + condition = "COMPLEX_EXPRESSION_UNSUPPORTED_INPUT.BAD_INPUTS", + parameters = Map( + "expression" -> "\"mode(NULL)\"", + "functionName" -> "\"MODE\"", + "dataType" -> s"\"MAP\"") ) } } From f4c39b17b5f61a3c7fda4873355b5cfa22498085 Mon Sep 17 00:00:00 2001 From: Gideon P Date: Mon, 30 Sep 2024 09:02:04 -0400 Subject: [PATCH 27/27] reformat error-conditions.json for test 'Error conditions are correctly formatted' in SparkThrowableSuite --- .../src/main/resources/error/error-conditions.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index b591d59a3f938..7a0aa5e80e8de 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -625,6 +625,11 @@ "Cannot process input data types for the expression: <expression>." ], "subClass" : { + "BAD_INPUTS" : { + "message" : [ + "The input data types to <functionName> must be valid, but found the input types <dataType>." + ] + }, "MISMATCHED_TYPES" : { "message" : [ "All input types must be the same except nullable, containsNull, valueContainsNull flags, but found the input types ." ] @@ -634,11 +639,6 @@ "message" : [ "The collection of input data types must not be empty."
] - }, - "BAD_INPUTS" : { - "message" : [ - "The input data types to <functionName> must be valid, but found the input types <dataType>." - ] } }, "sqlState" : "42K09"