From 441fa06808cbc2c32bba5a6236581caf546f7abf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 22 Apr 2026 11:09:39 -0600 Subject: [PATCH 1/3] fix: fall back to Spark for shuffle, sort, and aggregate on non-default collated strings The Spark 4.0 `CometTypeShim.isStringCollationType` check compared against `StringTypeWithCollation`, which is an `AbstractDataType` used only for expression type matching, so it never matched a concrete `StringType` with a non-default collation. As a result, Comet's columnar shuffle would hash-partition a collated column using byte-level hashing, sending rows that compare equal under the collation to different partitions. A downstream DISTINCT or GROUP BY then failed to deduplicate, producing incorrect aggregate results, for example on `listagg-collations.sql` query #5 in the Spark 4.0 SQL test suite. Detect non-default collations by comparing `collationId` against the default `StringType` object, and reject collated string columns in shuffle hash/range partitioning, sort, and aggregate grouping so those operators fall back to Spark. The previously disabled `listagg-collations.sql` query is re-enabled. Closes #1947 --- .../apache/comet/shims/CometTypeShim.scala | 13 +++-- dev/diffs/4.0.1.diff | 54 ------------------- .../apache/comet/serde/QueryPlanSerde.scala | 6 ++- .../shuffle/CometShuffleExchangeExec.scala | 18 ++++++- .../apache/spark/sql/comet/operators.scala | 10 +++- .../spark/sql/CometCollationSuite.scala | 48 +++++++++++++++++ 6 files changed, 88 insertions(+), 61 deletions(-) create mode 100644 spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala diff --git a/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala b/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala index 8bcb721603..1b82c04b20 100644 --- a/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala +++ b/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala @@ -19,9 +19,16 @@ package org.apache.comet.shims -import org.apache.spark.sql.internal.types.StringTypeWithCollation -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{DataType, StringType} trait CometTypeShim { - def isStringCollationType(dt: DataType): Boolean = dt.isInstanceOf[StringTypeWithCollation] + // A `StringType` carries collation metadata in Spark 4.0. Only non-default (non-UTF8_BINARY) + // collations have semantics Comet's byte-level hashing/sorting/equality cannot honor. The + // default `StringType` object is `StringType(UTF8_BINARY_COLLATION_ID)`, so comparing + // `collationId` against that instance's id picks out non-default collations without needing + // `private[sql]` helpers on `StringType`. + def isStringCollationType(dt: DataType): Boolean = dt match { + case st: StringType => st.collationId != StringType.collationId + case _ => false + } } diff --git a/dev/diffs/4.0.1.diff b/dev/diffs/4.0.1.diff index d846189720..58e49abdf4 100644 --- a/dev/diffs/4.0.1.diff +++ b/dev/diffs/4.0.1.diff @@ -150,26 +150,6 @@ index 4410fe50912..43bcce2a038 100644 case _ => Map[String, String]() } val childrenInfo = children.flatMap { -diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out -index 7aca17dcb25..8afeb3b4a2f 100644 ---- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out -+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out -@@ -64,15 +64,6 @@ WithCTE - +- CTERelationRef xxxx, true, [c1#x], false, false - - ---- !query --SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) ---- !query analysis --Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST))#x] --+- SubqueryAlias t -- +- Project [col1#x AS c1#x] -- +- LocalRelation [col1#x] -- -- - -- !query - WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t - -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql index 17815ed5dde..baad440b1ce 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql @@ -230,21 +210,6 @@ index 698ca009b4f..57d774a3617 100644 -- Test tables CREATE table explain_temp1 (key int, val int) USING PARQUET; -diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql -index aa3d02dc2fb..c4f878d9908 100644 ---- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql -+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql -@@ -5,7 +5,9 @@ WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES (' - -- Test cases with utf8_lcase. Lower expression added for determinism - SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); - WITH t(c1) AS (SELECT lower(listagg(DISTINCT col1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'b') FROM t; --SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1); -+-- TODO https://github.com/apache/datafusion-comet/issues/1947 -+-- TODO fix Comet for this query -+-- SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1); - -- Test cases with unicode_rtrim. - WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t; - WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql index 41fd4de2a09..162d5a817b6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql @@ -367,25 +332,6 @@ index 21a3ce1e122..f4762ab98f0 100644 SET spark.sql.ansi.enabled = false; -- In COMPENSATION views get invalidated if the type can't cast -diff --git a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out -index 1f8c5822e7d..b7de4e28813 100644 ---- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out -+++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out -@@ -40,14 +40,6 @@ struct - 2 1 1 - - ---- !query --SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) ---- !query schema --struct ---- !query output --ab -- -- - -- !query - WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t - -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 0f42502f1d9..e9ff802141f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index b74785bd1f..006b933517 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -39,12 +39,12 @@ import org.apache.comet.serde.ExprOuterClass.{AggExpr, Expr, ScalarFunc} import org.apache.comet.serde.Types.{DataType => ProtoDataType} import org.apache.comet.serde.Types.DataType._ import org.apache.comet.serde.literals.CometLiteral -import org.apache.comet.shims.CometExprShim +import org.apache.comet.shims.{CometExprShim, CometTypeShim} /** * An utility object for query plan and expression serialization. */ -object QueryPlanSerde extends Logging with CometExprShim { +object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private val arrayExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[ArrayAppend] -> CometArrayAppend, @@ -800,6 +800,8 @@ object QueryPlanSerde extends Logging with CometExprShim { // scalastyle:on def supportedScalarSortElementType(dt: DataType): Boolean = { dt match { + // Collated strings require collation-aware ordering; Comet only compares raw bytes. + case st: StringType if isStringCollationType(st) => false case _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType | _: DoubleType | _: DecimalType | _: DateType | _: TimestampType | _: TimestampNTZType | _: BooleanType | _: BinaryType | _: StringType => diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index 9d12846642..cc7a63f0cb 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -53,7 +53,7 @@ import org.apache.comet.CometConf.{COMET_EXEC_SHUFFLE_ENABLED, COMET_SHUFFLE_MOD import org.apache.comet.CometSparkSessionExtensions.{hasExplainInfo, isCometShuffleManagerEnabled, withInfos} import org.apache.comet.serde.{Compatible, OperatorOuterClass, QueryPlanSerde, SupportLevel, Unsupported} import org.apache.comet.serde.operator.CometSink -import org.apache.comet.shims.ShimCometShuffleExchangeExec +import org.apache.comet.shims.{CometTypeShim, ShimCometShuffleExchangeExec} /** * Performs a shuffle that will result in the desired partitioning. @@ -219,6 +219,7 @@ case class CometShuffleExchangeExec( object CometShuffleExchangeExec extends CometSink[ShuffleExchangeExec] with ShimCometShuffleExchangeExec + with CometTypeShim with SQLConfHelper { override def getSupportLevel(op: ShuffleExchangeExec): SupportLevel = { @@ -316,6 +317,9 @@ object CometShuffleExchangeExec * hashing complex types, see hash_funcs/utils.rs */ def supportedHashPartitioningDataType(dt: DataType): Boolean = dt match { + // Collated strings require collation-aware hashing; Comet only hashes raw bytes, + // which would misroute rows that compare equal under the collation. + case st: StringType if isStringCollationType(st) => false case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType | _: TimestampNTZType | _: DateType => @@ -338,6 +342,8 @@ object CometShuffleExchangeExec * complex types. */ def supportedRangePartitioningDataType(dt: DataType): Boolean = dt match { + // Collated strings require collation-aware ordering; Comet only compares raw bytes. + case st: StringType if isStringCollationType(st) => false case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType | _: TimestampNTZType | _: DecimalType | _: DateType => @@ -498,6 +504,11 @@ object CometShuffleExchangeExec reasons += s"unsupported hash partitioning expression: $expr" } } + for (dt <- expressions.map(_.dataType).distinct) { + if (isStringCollationType(dt)) { + reasons += s"unsupported hash partitioning data type for columnar shuffle: $dt" + } + } case SinglePartition => // we already checked that the input types are supported case RoundRobinPartitioning(_) => @@ -508,6 +519,11 @@ object CometShuffleExchangeExec reasons += s"unsupported range partitioning sort order: $o" } } + for (dt <- orderings.map(_.dataType).distinct) { + if (isStringCollationType(dt)) { + reasons += s"unsupported range partitioning data type for columnar shuffle: $dt" + } + } case _ => reasons += s"unsupported Spark partitioning for columnar shuffle: ${partitioning.getClass.getName}" diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index a6f6b03330..5f7e91529d 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -56,7 +56,7 @@ import org.apache.comet.CometSparkSessionExtensions.{isCometShuffleEnabled, with import org.apache.comet.parquet.CometParquetUtils import org.apache.comet.serde.{CometOperatorSerde, Compatible, Incompatible, OperatorOuterClass, SupportLevel, Unsupported} import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregateMode, Operator} -import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, supportedSortType} +import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, isStringCollationType, supportedSortType} import org.apache.comet.serde.operator.CometSink /** @@ -1386,6 +1386,14 @@ trait CometBaseAggregate { return None } + if (groupingExpressions.exists(expr => isStringCollationType(expr.dataType))) { + // Collation-aware grouping requires collation-aware hashing/equality; Comet only + // compares raw bytes, which would put rows that compare equal under the collation + // into different groups. + withInfo(aggregate, "Grouping on non-default collated strings is not supported") + return None + } + val groupingExprsWithInput = groupingExpressions.map(expr => expr.name -> exprToProto(expr, child.output)) diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala new file mode 100644 index 0000000000..329d8dd0d2 --- /dev/null +++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql + +class CometCollationSuite extends CometTestBase { + + test("listagg DISTINCT with utf8_lcase collation (issue #1947)") { + checkSparkAnswer( + "SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) " + + "WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) " + + "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)") + } + + test("DISTINCT on utf8_lcase collated string groups case-insensitively") { + checkSparkAnswer( + "SELECT DISTINCT c1 COLLATE utf8_lcase AS c " + + "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c") + } + + test("GROUP BY utf8_lcase collated string groups case-insensitively") { + checkSparkAnswer( + "SELECT lower(c1 COLLATE utf8_lcase) AS k, count(*) " + + "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) " + + "GROUP BY c1 COLLATE utf8_lcase ORDER BY k") + } + + test("default UTF8_BINARY string still runs through Comet") { + // Sanity check that the collation fallback does not over-block the default string type. + checkSparkAnswer("SELECT DISTINCT c1 FROM (VALUES ('a'), ('b'), ('a')) AS t(c1) ORDER BY c1") + } +} From 4bfe4cb7f7e6852c9661544dd3bb06626d75b977 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 22 Apr 2026 13:37:53 -0600 Subject: [PATCH 2/3] ci: add CometCollationSuite to pr_build workflows --- .github/workflows/pr_build_linux.yml | 1 + .github/workflows/pr_build_macos.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index 4823153f7b..91bc8714c3 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -346,6 +346,7 @@ jobs: - name: "sql" value: | org.apache.spark.sql.CometToPrettyStringSuite + org.apache.spark.sql.CometCollationSuite fail-fast: false name: ${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}] runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }} diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 6d6ac14ec9..3525687873 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -222,6 +222,7 @@ jobs: - name: "sql" value: | org.apache.spark.sql.CometToPrettyStringSuite + org.apache.spark.sql.CometCollationSuite fail-fast: false name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}] From 8bc38c0e9918d7df3b495f3050abae4105c69c69 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 10:18:44 -0600 Subject: [PATCH 3/3] test: tighten CometCollationSuite to pin fallback reasons and add ORDER BY case Address PR #4035 review feedback: - Replace checkSparkAnswer with checkSparkAnswerAndFallbackReason on the listagg/DISTINCT/GROUP BY tests so regressions that silently re-enable the native path are caught. The shuffle-exchange rule is the first layer that fires for collated keys, so pin its reason string rather than the defense-in-depth aggregate check. - Add an ORDER BY test covering the range-partitioning fallback. - Rework the default-UTF8_BINARY sanity test to use withParquetTable so checkSparkAnswerAndOperator has a real Comet scan to attach to. --- .../spark/sql/CometCollationSuite.scala | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala index 329d8dd0d2..463e169b66 100644 --- a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala +++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala @@ -21,28 +21,49 @@ package org.apache.spark.sql class CometCollationSuite extends CometTestBase { + // Queries that group, sort, or shuffle on a non-default collated string must fall back to + // Spark because Comet's shuffle/sort/aggregate compare raw bytes rather than collation-aware + // keys. The shuffle-exchange rule is the primary line of defense (see #1947), so these tests + // pin down the fallback reason it emits. + private val hashShuffleCollationReason = + "unsupported hash partitioning data type for columnar shuffle" + private val rangeShuffleCollationReason = + "unsupported range partitioning data type for columnar shuffle" + test("listagg DISTINCT with utf8_lcase collation (issue #1947)") { - checkSparkAnswer( + checkSparkAnswerAndFallbackReason( "SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) " + "WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) " + - "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)") + "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)", + hashShuffleCollationReason) } test("DISTINCT on utf8_lcase collated string groups case-insensitively") { - checkSparkAnswer( + checkSparkAnswerAndFallbackReason( "SELECT DISTINCT c1 COLLATE utf8_lcase AS c " + - "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c") + "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c", + hashShuffleCollationReason) } test("GROUP BY utf8_lcase collated string groups case-insensitively") { - checkSparkAnswer( + checkSparkAnswerAndFallbackReason( "SELECT lower(c1 COLLATE utf8_lcase) AS k, count(*) " + "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) " + - "GROUP BY c1 COLLATE utf8_lcase ORDER BY k") + "GROUP BY c1 COLLATE utf8_lcase ORDER BY k", + hashShuffleCollationReason) + } + + test("ORDER BY utf8_lcase collated string sorts case-insensitively") { + checkSparkAnswerAndFallbackReason( + "SELECT c1 COLLATE utf8_lcase AS c " + + "FROM (VALUES ('A'), ('b'), ('a'), ('B')) AS t(c1) ORDER BY c", + rangeShuffleCollationReason) } test("default UTF8_BINARY string still runs through Comet") { // Sanity check that the collation fallback does not over-block the default string type. - checkSparkAnswer("SELECT DISTINCT c1 FROM (VALUES ('a'), ('b'), ('a')) AS t(c1) ORDER BY c1") + withParquetTable(Seq(("a", 1), ("b", 2), ("a", 3)), "tbl") { + checkSparkAnswerAndOperator("SELECT DISTINCT _1 FROM tbl ORDER BY _1") + } } }