Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,14 @@ class Analyzer(
// Only Project, Aggregate, CollectMetrics can host star expressions.
case u @ (_: Project | _: Aggregate | _: CollectMetrics) =>
Try(s.expand(u.children.head, resolver)) match {
case Success(expanded) => expanded.map(wrapOuterReference)
case Success(expanded) =>
expanded.map {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Design: this expand helper is shared with the function-arg / `CreateStruct` / `CreateArray` / `In` / `Murmur3Hash` / `XxHash64` callers further down (lines 1969-1995). Only the project-list path (`buildExpandedProjectList`) exposes `Project.output`, which is where the ExprId leak you're fixing actually surfaces. The other callers don't need an `Alias` wrap — and as the `selectExcept.sql.out` and `ResolveSubquerySuite` diffs show, wrapping them changes the auto-generated column name (the inner `AS c1`, `AS c2` get baked into the surrounding alias's `toPrettySQL` before `CleanupAliases` runs).

Could the wrapping be moved to `buildExpandedProjectList` (and similar project-list call sites) so function-arg expansions keep bare `OuterReference`s? That would scope the change to where the leak actually exists and avoid the column-name regression.

case alias: Alias =>
alias.withNewChildren(Seq(wrapOuterReference(alias.child)))
.asInstanceOf[Alias]
Comment on lines +1874 to +1875
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: `wrapOuterReference[E <: Expression](e: E): E` recursively transforms `Attribute` nodes and preserves the rest of the tree, so applying it to the whole alias produces the same `Alias(GetStructField(OuterReference(attr), i), name)` and avoids the `withNewChildren` + cast dance.

Suggested change
alias.withNewChildren(Seq(wrapOuterReference(alias.child)))
.asInstanceOf[Alias]
wrapOuterReference(alias)

case e =>
Alias(wrapOuterReference(e), toPrettySQL(e))()
Comment on lines +1876 to +1877
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the inner `case e =>` shadows the outer `case e: AnalysisException` from the surrounding catch. It still works (the outer `e` is referenced from `case Failure(_) => throw e`, which sits in a different scope) but reads confusingly.

Suggested change
case e =>
Alias(wrapOuterReference(e), toPrettySQL(e))()
case other =>
Alias(wrapOuterReference(other), toPrettySQL(other))()

}
case Failure(_) => throw e
}
// Do not use the outer plan to resolve the star expression
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,15 @@ class ResolveSubquerySuite extends AnalysisTest {
test("SPARK-35618: lateral join with star expansion in functions") {
val outerA = OuterReference(a.withQualifier(Seq("t1")))
val outerB = OuterReference(b.withQualifier(Seq("t1")))
val aliasedOuterA = Alias(outerA, a.name)()
val aliasedOuterB = Alias(outerB, b.name)()
val array = CreateArray(Seq(star("t1")))
val newArray = CreateArray(Seq(outerA, outerB))
val aliasedNewArray = CreateArray(Seq(aliasedOuterA, aliasedOuterB))
checkAnalysis(
lateralJoin(t1.as("t1"), t0.select(array)),
LateralJoin(t1,
LateralSubquery(t0.select(newArray.as(newArray.sql)), Seq(a, b)), Inner, None)
LateralSubquery(t0.select(newArray.as(aliasedNewArray.sql)), Seq(a, b)), Inner, None)
)
assertAnalysisErrorCondition(
lateralJoin(t1.as("t1"), t0.select(Count(star("t1")))),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ Project [c1#x, c2#x, c3#x, c4#x, c5#x]
+- LateralJoin lateral-subquery#x [c1#x && c2#x && c3#x && c4#x && c5#x], Inner
: +- SubqueryAlias T
: +- Project [c1#x AS c1#x, c2#x AS c2#x, c3#x AS c3#x, c4#x AS c4#x, c5#x AS c5#x]
: +- Project [outer(c1#x), outer(c2#x), outer(c3#x), outer(c4#x), outer(c5#x)]
: +- Project [outer(c1#x) AS c1#x, outer(c2#x) AS c2#x, outer(c3#x) AS c3#x, outer(c4#x) AS c4#x, outer(c5#x) AS c5#x]
: +- OneRowRelation
+- SubqueryAlias v1
+- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x])
Expand All @@ -522,8 +522,8 @@ SELECT T.* FROM v1, LATERAL (SELECT COALESCE(v1.*)) AS T(x)
Project [x#x]
+- LateralJoin lateral-subquery#x [c1#x && c2#x && c3#x && c4#x && c5#x], Inner
: +- SubqueryAlias T
: +- Project [coalesce(outer(v1.c1), outer(v1.c2), outer(v1.c3), outer(v1.c4), outer(v1.c5))#x AS x#x]
: +- Project [coalesce(outer(c1#x), outer(c2#x), cast(outer(c3#x) as int), outer(c4#x), outer(c5#x)) AS coalesce(outer(v1.c1), outer(v1.c2), outer(v1.c3), outer(v1.c4), outer(v1.c5))#x]
: +- Project [coalesce(outer(v1.c1) AS c1, outer(v1.c2) AS c2, outer(v1.c3) AS c3, outer(v1.c4) AS c4, outer(v1.c5) AS c5)#x AS x#x]
: +- Project [coalesce(outer(c1#x), outer(c2#x), cast(outer(c3#x) as int), outer(c4#x), outer(c5#x)) AS coalesce(outer(v1.c1) AS c1, outer(v1.c2) AS c2, outer(v1.c3) AS c3, outer(v1.c4) AS c4, outer(v1.c5) AS c5)#x]
: +- OneRowRelation
+- SubqueryAlias v1
+- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1075,3 +1075,73 @@ Project [c1#x, c2#x, scalar-subquery#x [c1#x] AS scalarsubquery(c1)#xL]
+- View (`t1`, [c1#x, c2#x])
+- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x]
+- LocalRelation [col1#x, col2#x]


-- !query
SELECT (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query analysis
Project [scalar-subquery#x [col1#x] AS scalarsubquery(col1)#x]
: +- GlobalLimit 1
: +- LocalLimit 1
: +- Project [outer(col1#x) AS col1#x]
: +- SubqueryAlias t2
: +- LocalRelation [col1#x]
+- SubqueryAlias t1
+- LocalRelation [col1#x]


-- !query
SELECT (SELECT t1.s.* FROM VALUES(2) AS t2(col1) LIMIT 1)
FROM (SELECT named_struct('a', 1) AS s) AS t1
-- !query analysis
Project [scalar-subquery#x [s#x] AS scalarsubquery(s)#x]
: +- GlobalLimit 1
: +- LocalLimit 1
: +- Project [outer(s#x).a AS a#x]
: +- SubqueryAlias t2
: +- LocalRelation [col1#x]
+- SubqueryAlias t1
+- Project [named_struct(a, 1) AS s#x]
+- OneRowRelation


-- !query
SELECT (SELECT * FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query analysis
Project [scalar-subquery#x [] AS scalarsubquery()#x]
: +- GlobalLimit 1
: +- LocalLimit 1
: +- Project [col1#x]
: +- SubqueryAlias t2
: +- LocalRelation [col1#x]
+- SubqueryAlias t1
+- LocalRelation [col1#x]


-- !query
SELECT (SELECT t1.* FROM (SELECT 3 AS col1) AS t1 LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query analysis
Project [scalar-subquery#x [] AS scalarsubquery()#x]
: +- GlobalLimit 1
: +- LocalLimit 1
: +- Project [col1#x]
: +- SubqueryAlias t1
: +- Project [3 AS col1#x]
: +- OneRowRelation
+- SubqueryAlias t1
+- LocalRelation [col1#x]


-- !query
SELECT (SELECT * FROM (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1)) FROM VALUES(1) AS t1(col1)
-- !query analysis
Project [scalar-subquery#x [col1#x] AS scalarsubquery(col1)#x]
: +- Project [col1#x]
: +- SubqueryAlias __auto_generated_subquery_name
: +- GlobalLimit 1
: +- LocalLimit 1
: +- Project [outer(col1#x) AS col1#x]
: +- SubqueryAlias t2
: +- LocalRelation [col1#x]
+- SubqueryAlias t1
+- LocalRelation [col1#x]
Original file line number Diff line number Diff line change
Expand Up @@ -258,4 +258,20 @@ select * from (
where t.c2 is not null;

-- SPARK-43838: Subquery on single table with having clause
SELECT c1, c2, (SELECT count(*) cnt FROM t1 t2 WHERE t1.c1 = t2.c1 HAVING cnt = 0) FROM t1
SELECT c1, c2, (SELECT count(*) cnt FROM t1 t2 WHERE t1.c1 = t2.c1 HAVING cnt = 0) FROM t1;

-- Outer star expansion in scalar subquery
SELECT (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1);

-- Outer struct star expansion in scalar subquery
SELECT (SELECT t1.s.* FROM VALUES(2) AS t2(col1) LIMIT 1)
FROM (SELECT named_struct('a', 1) AS s) AS t1;

-- Untargeted star in subquery should NOT expand from outer scope
SELECT (SELECT * FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1);

-- Inner scope wins when star target matches both inner and outer scope
SELECT (SELECT t1.* FROM (SELECT 3 AS col1) AS t1 LIMIT 1) FROM VALUES(1) AS t1(col1);

-- Outer star expansion through a derived table wrapper
SELECT (SELECT * FROM (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1)) FROM VALUES(1) AS t1(col1);
Original file line number Diff line number Diff line change
Expand Up @@ -607,3 +607,44 @@ struct<c1:int,c2:int,scalarsubquery(c1):bigint>
-- !query output
0 1 NULL
1 2 NULL


-- !query
SELECT (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query schema
struct<scalarsubquery(col1):int>
-- !query output
1


-- !query
SELECT (SELECT t1.s.* FROM VALUES(2) AS t2(col1) LIMIT 1)
FROM (SELECT named_struct('a', 1) AS s) AS t1
-- !query schema
struct<scalarsubquery(s):int>
-- !query output
1


-- !query
SELECT (SELECT * FROM VALUES(2) AS t2(col1) LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query schema
struct<scalarsubquery():int>
-- !query output
2


-- !query
SELECT (SELECT t1.* FROM (SELECT 3 AS col1) AS t1 LIMIT 1) FROM VALUES(1) AS t1(col1)
-- !query schema
struct<scalarsubquery():int>
-- !query output
3


-- !query
SELECT (SELECT * FROM (SELECT t1.* FROM VALUES(2) AS t2(col1) LIMIT 1)) FROM VALUES(1) AS t1(col1)
-- !query schema
struct<scalarsubquery(col1):int>
-- !query output
1