From 441fa06808cbc2c32bba5a6236581caf546f7abf Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 22 Apr 2026 11:09:39 -0600
Subject: [PATCH 1/3] fix: fall back to Spark for shuffle, sort, and aggregate
 on non-default collated strings

The Spark 4.0 `CometTypeShim.isStringCollationType` check compared against
`StringTypeWithCollation`, which is an `AbstractDataType` used only for
expression type matching, so it never matched a concrete `StringType` with a
non-default collation.

As a result, Comet's columnar shuffle would hash-partition a collated column
using byte-level hashing, sending rows that compare equal under the collation
to different partitions. A downstream DISTINCT or GROUP BY then failed to
deduplicate, producing incorrect aggregate results, for example on
`listagg-collations.sql` query #5 in the Spark 4.0 SQL test suite.

Detect non-default collations by comparing `collationId` against the default
`StringType` object, and reject collated string columns in shuffle hash/range
partitioning, sort, and aggregate grouping so those operators fall back to
Spark. The previously disabled `listagg-collations.sql` query is re-enabled.

Closes #1947
---
 .../apache/comet/shims/CometTypeShim.scala    | 13 +++--
 dev/diffs/4.0.1.diff                          | 54 -------------------
 .../apache/comet/serde/QueryPlanSerde.scala   |  6 ++-
 .../shuffle/CometShuffleExchangeExec.scala    | 18 ++++++-
 .../apache/spark/sql/comet/operators.scala    | 10 +++-
 .../spark/sql/CometCollationSuite.scala       | 48 +++++++++++++++++
 6 files changed, 88 insertions(+), 61 deletions(-)
 create mode 100644 spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala

diff --git a/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala b/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala
index 8bcb721603..1b82c04b20 100644
--- a/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala
+++ b/common/src/main/spark-4.0/org/apache/comet/shims/CometTypeShim.scala
@@ -19,9 +19,16 @@
 
 package org.apache.comet.shims
 
-import org.apache.spark.sql.internal.types.StringTypeWithCollation
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.{DataType, StringType}
 
 trait CometTypeShim {
-  def isStringCollationType(dt: DataType): Boolean = dt.isInstanceOf[StringTypeWithCollation]
+  // A `StringType` carries collation metadata in Spark 4.0. Only non-default (non-UTF8_BINARY)
+  // collations have semantics Comet's byte-level hashing/sorting/equality cannot honor. The
+  // default `StringType` object is `StringType(UTF8_BINARY_COLLATION_ID)`, so comparing
+  // `collationId` against that instance's id picks out non-default collations without needing
+  // `private[sql]` helpers on `StringType`.
+  def isStringCollationType(dt: DataType): Boolean = dt match {
+    case st: StringType => st.collationId != StringType.collationId
+    case _ => false
+  }
 }
diff --git a/dev/diffs/4.0.1.diff b/dev/diffs/4.0.1.diff
index d846189720..58e49abdf4 100644
--- a/dev/diffs/4.0.1.diff
+++ b/dev/diffs/4.0.1.diff
@@ -150,26 +150,6 @@ index 4410fe50912..43bcce2a038 100644
        case _ => Map[String, String]()
      }
      val childrenInfo = children.flatMap {
-diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
-index 7aca17dcb25..8afeb3b4a2f 100644
---- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
-+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
-@@ -64,15 +64,6 @@ WithCTE
-       +- CTERelationRef xxxx, true, [c1#x], false, false
- 
- 
---- !query
--SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
---- !query analysis
--Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST))#x]
--+- SubqueryAlias t
--   +- Project [col1#x AS c1#x]
--      +- LocalRelation [col1#x]
--
--
- -- !query
- WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc  '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t
- -- !query analysis
 diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
 index 17815ed5dde..baad440b1ce 100644
 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -230,21 +210,6 @@ index 698ca009b4f..57d774a3617 100644
  
  -- Test tables
  CREATE table  explain_temp1 (key int, val int) USING PARQUET;
-diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
-index aa3d02dc2fb..c4f878d9908 100644
---- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
-+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
-@@ -5,7 +5,9 @@ WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES ('
- -- Test cases with utf8_lcase. Lower expression added for determinism
- SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1);
- WITH t(c1) AS (SELECT lower(listagg(DISTINCT col1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'b') FROM t;
--SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
-+-- TODO https://github.com/apache/datafusion-comet/issues/1947
-+-- TODO fix Comet for this query
-+-- SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
- -- Test cases with unicode_rtrim.
- WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc  '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t;
- WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t;
 diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql
 index 41fd4de2a09..162d5a817b6 100644
 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql
@@ -367,25 +332,6 @@ index 21a3ce1e122..f4762ab98f0 100644
  SET spark.sql.ansi.enabled = false;
  
  -- In COMPENSATION views get invalidated if the type can't cast
-diff --git a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
-index 1f8c5822e7d..b7de4e28813 100644
---- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
-+++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
-@@ -40,14 +40,6 @@ struct<len(c1):int,regexp_count(c1, a):int,regexp_count(c1, b):int>
- 2	1	1
- 
- 
---- !query
--SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
---- !query schema
--struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)):string collate UTF8_LCASE>
---- !query output
--ab
--
--
- -- !query
- WITH t(c1) AS (SELECT replace(listagg(DISTINCT col1 COLLATE unicode_rtrim) COLLATE utf8_binary, ' ', '') FROM (VALUES ('xbc  '), ('xbc '), ('a'), ('xbc'))) SELECT len(c1), regexp_count(c1, 'a'), regexp_count(c1, 'xbc') FROM t
- -- !query schema
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
 index 0f42502f1d9..e9ff802141f 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index b74785bd1f..006b933517 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -39,12 +39,12 @@ import org.apache.comet.serde.ExprOuterClass.{AggExpr, Expr, ScalarFunc}
 import org.apache.comet.serde.Types.{DataType => ProtoDataType}
 import org.apache.comet.serde.Types.DataType._
 import org.apache.comet.serde.literals.CometLiteral
-import org.apache.comet.shims.CometExprShim
+import org.apache.comet.shims.{CometExprShim, CometTypeShim}
 
 /**
  * An utility object for query plan and expression serialization.
  */
-object QueryPlanSerde extends Logging with CometExprShim {
+object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
 
   private val arrayExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
     classOf[ArrayAppend] -> CometArrayAppend,
@@ -800,6 +800,8 @@ object QueryPlanSerde extends Logging with CometExprShim {
   // scalastyle:on
   def supportedScalarSortElementType(dt: DataType): Boolean = {
     dt match {
+      // Collated strings require collation-aware ordering; Comet only compares raw bytes.
+      case st: StringType if isStringCollationType(st) => false
       case _: ByteType | _: ShortType | _: IntegerType | _: LongType | _: FloatType |
           _: DoubleType | _: DecimalType | _: DateType | _: TimestampType | _: TimestampNTZType |
           _: BooleanType | _: BinaryType | _: StringType =>
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index 9d12846642..cc7a63f0cb 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -53,7 +53,7 @@ import org.apache.comet.CometConf.{COMET_EXEC_SHUFFLE_ENABLED, COMET_SHUFFLE_MOD
 import org.apache.comet.CometSparkSessionExtensions.{hasExplainInfo, isCometShuffleManagerEnabled, withInfos}
 import org.apache.comet.serde.{Compatible, OperatorOuterClass, QueryPlanSerde, SupportLevel, Unsupported}
 import org.apache.comet.serde.operator.CometSink
-import org.apache.comet.shims.ShimCometShuffleExchangeExec
+import org.apache.comet.shims.{CometTypeShim, ShimCometShuffleExchangeExec}
 
 /**
  * Performs a shuffle that will result in the desired partitioning.
@@ -219,6 +219,7 @@ case class CometShuffleExchangeExec(
 object CometShuffleExchangeExec
     extends CometSink[ShuffleExchangeExec]
     with ShimCometShuffleExchangeExec
+    with CometTypeShim
     with SQLConfHelper {
 
   override def getSupportLevel(op: ShuffleExchangeExec): SupportLevel = {
@@ -316,6 +317,9 @@ object CometShuffleExchangeExec
      * hashing complex types, see hash_funcs/utils.rs
      */
     def supportedHashPartitioningDataType(dt: DataType): Boolean = dt match {
+      // Collated strings require collation-aware hashing; Comet only hashes raw bytes,
+      // which would misroute rows that compare equal under the collation.
+      case st: StringType if isStringCollationType(st) => false
       case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType |
           _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType |
           _: TimestampNTZType | _: DateType =>
@@ -338,6 +342,8 @@ object CometShuffleExchangeExec
      * complex types.
      */
     def supportedRangePartitioningDataType(dt: DataType): Boolean = dt match {
+      // Collated strings require collation-aware ordering; Comet only compares raw bytes.
+      case st: StringType if isStringCollationType(st) => false
       case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType |
           _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType |
           _: TimestampNTZType | _: DecimalType | _: DateType =>
@@ -498,6 +504,11 @@ object CometShuffleExchangeExec
             reasons += s"unsupported hash partitioning expression: $expr"
           }
         }
+        for (dt <- expressions.map(_.dataType).distinct) {
+          if (isStringCollationType(dt)) {
+            reasons += s"unsupported hash partitioning data type for columnar shuffle: $dt"
+          }
+        }
       case SinglePartition =>
       // we already checked that the input types are supported
       case RoundRobinPartitioning(_) =>
@@ -508,6 +519,11 @@ object CometShuffleExchangeExec
             reasons += s"unsupported range partitioning sort order: $o"
           }
         }
+        for (dt <- orderings.map(_.dataType).distinct) {
+          if (isStringCollationType(dt)) {
+            reasons += s"unsupported range partitioning data type for columnar shuffle: $dt"
+          }
+        }
       case _ =>
         reasons +=
           s"unsupported Spark partitioning for columnar shuffle: ${partitioning.getClass.getName}"
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index a6f6b03330..5f7e91529d 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -56,7 +56,7 @@ import org.apache.comet.CometSparkSessionExtensions.{isCometShuffleEnabled, with
 import org.apache.comet.parquet.CometParquetUtils
 import org.apache.comet.serde.{CometOperatorSerde, Compatible, Incompatible, OperatorOuterClass, SupportLevel, Unsupported}
 import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregateMode, Operator}
-import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, supportedSortType}
+import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, isStringCollationType, supportedSortType}
 import org.apache.comet.serde.operator.CometSink
 
 /**
@@ -1386,6 +1386,14 @@ trait CometBaseAggregate {
       return None
     }
 
+    if (groupingExpressions.exists(expr => isStringCollationType(expr.dataType))) {
+      // Collation-aware grouping requires collation-aware hashing/equality; Comet only
+      // compares raw bytes, which would put rows that compare equal under the collation
+      // into different groups.
+      withInfo(aggregate, "Grouping on non-default collated strings is not supported")
+      return None
+    }
+
     val groupingExprsWithInput =
       groupingExpressions.map(expr => expr.name -> exprToProto(expr, child.output))
 
diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala
new file mode 100644
index 0000000000..329d8dd0d2
--- /dev/null
+++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql
+
+class CometCollationSuite extends CometTestBase {
+
+  test("listagg DISTINCT with utf8_lcase collation (issue #1947)") {
+    checkSparkAnswer(
+      "SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) " +
+        "WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) " +
+        "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)")
+  }
+
+  test("DISTINCT on utf8_lcase collated string groups case-insensitively") {
+    checkSparkAnswer(
+      "SELECT DISTINCT c1 COLLATE utf8_lcase AS c " +
+        "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c")
+  }
+
+  test("GROUP BY utf8_lcase collated string groups case-insensitively") {
+    checkSparkAnswer(
+      "SELECT lower(c1 COLLATE utf8_lcase) AS k, count(*) " +
+        "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) " +
+        "GROUP BY c1 COLLATE utf8_lcase ORDER BY k")
+  }
+
+  test("default UTF8_BINARY string still runs through Comet") {
+    // Sanity check that the collation fallback does not over-block the default string type.
+    checkSparkAnswer("SELECT DISTINCT c1 FROM (VALUES ('a'), ('b'), ('a')) AS t(c1) ORDER BY c1")
+  }
+}

From 4bfe4cb7f7e6852c9661544dd3bb06626d75b977 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 22 Apr 2026 13:37:53 -0600
Subject: [PATCH 2/3] ci: add CometCollationSuite to pr_build workflows

---
 .github/workflows/pr_build_linux.yml | 1 +
 .github/workflows/pr_build_macos.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index 4823153f7b..91bc8714c3 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -346,6 +346,7 @@ jobs:
           - name: "sql"
             value: |
               org.apache.spark.sql.CometToPrettyStringSuite
+              org.apache.spark.sql.CometCollationSuite
       fail-fast: false
     name: ${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}]
     runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 6d6ac14ec9..3525687873 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -222,6 +222,7 @@ jobs:
           - name: "sql"
             value: |
               org.apache.spark.sql.CometToPrettyStringSuite
+              org.apache.spark.sql.CometCollationSuite
 
       fail-fast: false
     name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]

From 8bc38c0e9918d7df3b495f3050abae4105c69c69 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 23 Apr 2026 10:18:44 -0600
Subject: [PATCH 3/3] test: tighten CometCollationSuite to pin fallback reasons
 and add ORDER BY case

Address PR #4035 review feedback:

- Replace checkSparkAnswer with checkSparkAnswerAndFallbackReason on the
  listagg/DISTINCT/GROUP BY tests so regressions that silently re-enable the
  native path are caught. The shuffle-exchange rule is the first layer that
  fires for collated keys, so pin its reason string rather than the
  defense-in-depth aggregate check.
- Add an ORDER BY test covering the range-partitioning fallback.
- Rework the default-UTF8_BINARY sanity test to use withParquetTable so
  checkSparkAnswerAndOperator has a real Comet scan to attach to.
---
 .../spark/sql/CometCollationSuite.scala       | 35 +++++++++++++++----
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala
index 329d8dd0d2..463e169b66 100644
--- a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala
+++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala
@@ -21,28 +21,49 @@ package org.apache.spark.sql
 
 class CometCollationSuite extends CometTestBase {
 
+  // Queries that group, sort, or shuffle on a non-default collated string must fall back to
+  // Spark because Comet's shuffle/sort/aggregate compare raw bytes rather than collation-aware
+  // keys. The shuffle-exchange rule is the primary line of defense (see #1947), so these tests
+  // pin down the fallback reason it emits.
+  private val hashShuffleCollationReason =
+    "unsupported hash partitioning data type for columnar shuffle"
+  private val rangeShuffleCollationReason =
+    "unsupported range partitioning data type for columnar shuffle"
+
   test("listagg DISTINCT with utf8_lcase collation (issue #1947)") {
-    checkSparkAnswer(
+    checkSparkAnswerAndFallbackReason(
       "SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) " +
         "WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) " +
-        "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)")
+        "FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)",
+      hashShuffleCollationReason)
   }
 
   test("DISTINCT on utf8_lcase collated string groups case-insensitively") {
-    checkSparkAnswer(
+    checkSparkAnswerAndFallbackReason(
       "SELECT DISTINCT c1 COLLATE utf8_lcase AS c " +
-        "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c")
+        "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) ORDER BY c",
+      hashShuffleCollationReason)
   }
 
   test("GROUP BY utf8_lcase collated string groups case-insensitively") {
-    checkSparkAnswer(
+    checkSparkAnswerAndFallbackReason(
       "SELECT lower(c1 COLLATE utf8_lcase) AS k, count(*) " +
         "FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) " +
-        "GROUP BY c1 COLLATE utf8_lcase ORDER BY k")
+        "GROUP BY c1 COLLATE utf8_lcase ORDER BY k",
+      hashShuffleCollationReason)
+  }
+
+  test("ORDER BY utf8_lcase collated string sorts case-insensitively") {
+    checkSparkAnswerAndFallbackReason(
+      "SELECT c1 COLLATE utf8_lcase AS c " +
+        "FROM (VALUES ('A'), ('b'), ('a'), ('B')) AS t(c1) ORDER BY c",
+      rangeShuffleCollationReason)
   }
 
   test("default UTF8_BINARY string still runs through Comet") {
     // Sanity check that the collation fallback does not over-block the default string type.
-    checkSparkAnswer("SELECT DISTINCT c1 FROM (VALUES ('a'), ('b'), ('a')) AS t(c1) ORDER BY c1")
+    withParquetTable(Seq(("a", 1), ("b", 2), ("a", 3)), "tbl") {
+      checkSparkAnswerAndOperator("SELECT DISTINCT _1 FROM tbl ORDER BY _1")
+    }
   }
 }