Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ case class ListAgg(
inputAggBufferOffset: Int = 0)
extends Collect[mutable.ArrayBuffer[Any]]
with SupportsOrderingWithinGroup
with ImplicitCastInputTypes {
with ImplicitCastInputTypes
with AliasHelper {

override def orderingFilled: Boolean = orderExpressions.nonEmpty

Expand Down Expand Up @@ -588,7 +589,8 @@ case class ListAgg(
if (someOrder.isEmpty) {
return true
}
if (someOrder.size == 1 && someOrder.head.child.semanticEquals(child)) {
if (someOrder.size == 1 &&
trimAliases(someOrder.head.child).semanticEquals(trimAliases(child))) {
Comment thread
mihailoale-db marked this conversation as resolved.
return true
}
false
Expand Down Expand Up @@ -679,7 +681,7 @@ case class ListAgg(
if (orderExpressions.size != 1) return OrderDeterminismResult.NonDeterministicMismatch
child match {
case Cast(castChild, castType, _, _)
if orderExpressions.head.child.semanticEquals(castChild) =>
if trimAliases(orderExpressions.head.child).semanticEquals(trimAliases(castChild)) =>
if (isCastEqualityPreserving(castChild.dataType) &&
isCastTargetEqualityPreserving(castType)) {
OrderDeterminismResult.Deterministic
Expand Down
103 changes: 103 additions & 0 deletions sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -702,3 +702,106 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) DESC NULLS LAST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS LAST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a.b, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a.b, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": {"b": "x"}}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": {"b": "y"}}, true) AS parse_json({"a": {"b": "y"}})#x]
: +- OneRowRelation
+- Project [parse_json({"a": {"b": "x"}}, true) AS parse_json({"a": {"b": "x"}})#x]
+- OneRowRelation


-- !query
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp
-- !query analysis
Aggregate [grp#x], [grp#x, listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Union false, false
: : :- Union false, false
: : : :- Project [1 AS grp#x, parse_json({"a": "x"}, true) AS v#x]
: : : : +- OneRowRelation
: : : +- Project [1 AS 1#x, parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: : : +- OneRowRelation
: : +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
: : +- OneRowRelation
: +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
: +- OneRowRelation
+- Project [1 AS 1#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE",
"sqlState" : "42K09",
"messageParameters" : {
"dataType" : "\"VARIANT\"",
"functionName" : "`sortorder`",
"sqlExpr" : "\"variant_get(v, $.a) ASC NULLS FIRST\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 66,
"stopIndex" : 68,
"fragment" : "v:a"
} ]
}
15 changes: 15 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/listagg.sql
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,18 @@ SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df;
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3 as double)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'), (TIMESTAMP'2024-01-01 10:00:00') AS t(col);

-- LISTAGG with semi-structured extract (parser wraps v:a in Alias with fresh ExprId)
-- Tests that isOrderCompatible strips Alias wrappers before comparing via semanticEquals
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
Comment thread
mihailoale-db marked this conversation as resolved.
-- Semi-structured extract without DISTINCT
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
-- Semi-structured extract with DESC ordering
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
-- Semi-structured extract with nested path
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'));
-- Semi-structured extract with GROUP BY
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp;
-- Semi-structured extract: DISTINCT cast to string, ordered by the raw VARIANT extract
-- VARIANT is not an orderable type, so analysis fails with DATATYPE_MISMATCH.INVALID_ORDERING_TYPE on the sort order before listagg's own order-determinism check is exercised
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment above says this exercises checkOrderValueDeterminism's alias stripping, but ORDER BY v:a is VARIANT and fails with DATATYPE_MISMATCH.INVALID_ORDERING_TYPE from SortOrder.checkInputDataTypes before listagg's checks run — see this query's analyzer-results golden output: only that error is raised, no functionAndOrderExpressionUnsafeCastError. The test would pass identically with or without the new trimAliases you added at line 684.

To actually hit the Cast(castChild, ...) arm in checkOrderValueDeterminism, the order column needs to be orderable but non-equality-preserving when cast to String, e.g. LISTAGG(DISTINCT (v:a)::double::string, ',') WITHIN GROUP (ORDER BY (v:a)::double). Without the trim, the single-pass aliases inside castChild and orderExpressions.head.child mismatch and you'd get NonDeterministicMismatch → functionAndOrderExpressionMismatchError; with the trim, they match and you get NonDeterministicCast(Double, String) → functionAndOrderExpressionUnsafeCastError. That divergence is what pins the fix.

65 changes: 65 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/listagg.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -563,3 +563,68 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
x,y


-- !query
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
x,x,y


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS LAST):string>
-- !query output
y,x


-- !query
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS FIRST):string>
-- !query output
x,y


-- !query
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp
-- !query schema
struct<grp:int,listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
1 x,y
2 x


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE",
"sqlState" : "42K09",
"messageParameters" : {
"dataType" : "\"VARIANT\"",
"functionName" : "`sortorder`",
"sqlExpr" : "\"variant_get(v, $.a) ASC NULLS FIRST\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 66,
"stopIndex" : 68,
"fragment" : "v:a"
} ]
}