From 9241b8e8c0dfe35fbe1631fd440527eb72d88de8 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 14:08:30 +0800
Subject: [PATCH] [SPARK-48229][SQL] Add collation support for inputFile expressions

### What changes were proposed in this pull request?
Introduce collation awareness for inputFile expressions: input_file_name.

### Why are the changes needed?
Add collation support for inputFile expressions in Spark.

### Does this PR introduce _any_ user-facing change?
Yes, users should now be able to use collated strings within arguments for inputFile functions: input_file_name.

### How was this patch tested?
E2e sql tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46503 from uros-db/input-file-block.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan
---
 .../catalyst/expressions/inputFileBlock.scala |  5 +++--
 .../sql/CollationSQLExpressionsSuite.scala    | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala
index 6cd88367aa9a0..65eb995ff32ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala
@@ -21,7 +21,8 @@ import org.apache.spark.rdd.InputFileBlockHolder
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
-import org.apache.spark.sql.types.{DataType, LongType, StringType}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{DataType, LongType}
 import org.apache.spark.unsafe.types.UTF8String
 
 // scalastyle:off whitespace.end.of.line
@@ -39,7 +40,7 @@ case class InputFileName() extends LeafExpression with Nondeterministic {
 
   override def nullable: Boolean = false
 
-  override def dataType: DataType = StringType
+  override def dataType: DataType = SQLConf.get.defaultStringType
 
   override def prettyName: String = "input_file_name"
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
index dd5703d1284a3..22b29154cd78a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
@@ -1275,6 +1275,23 @@ class CollationSQLExpressionsSuite
     })
   }
 
+  test("Support InputFileName expression with collation") {
+    // Supported collations
+    Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach(collationName => {
+      val query =
+        s"""
+          |select input_file_name()
+          |""".stripMargin
+      // Result
+      withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) {
+        val testQuery = sql(query)
+        checkAnswer(testQuery, Row(""))
+        val dataType = StringType(collationName)
+        assert(testQuery.schema.fields.head.dataType.sameType(dataType))
+      }
+    })
+  }
+
   // TODO: Add more tests for other SQL expressions
 
 }