Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
import org.apache.spark.sql.catalyst.trees.TreePattern._
import org.apache.spark.sql.connector.catalog.CatalogManager
import org.apache.spark.sql.errors.{DataTypeErrorsBase, QueryCompilationErrors}
import org.apache.spark.sql.catalyst.util.toPrettySQL
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.NullType

trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {

Expand Down Expand Up @@ -180,7 +182,19 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {
field
}
if (newChild.resolved) {
ExtractValue(child = newChild, extraction = resolvedField, resolver = resolver)
// applyOrNull propagates NULL when the base is NullType instead of throwing
// INVALID_EXTRACT_BASE_FIELD_TYPE, consistent with multipart field access (col.a).
val extracted = ExtractValue.applyOrNull(
child = newChild, extraction = resolvedField, resolver = resolver)
// A NullType base yields a bare NULL literal, which would otherwise produce an output
// column named `NULL`. Alias it with the extraction's text (e.g. `col[0]`) to keep a
// stable column name; CleanupAliases later trims this alias where it's not a top-level
// projection output.
if (newChild.dataType == NullType) {
Alias(extracted, toPrettySQL(u.copy(child = newChild, extraction = resolvedField)))()
} else {
extracted
}
} else {
u.copy(child = newChild, extraction = resolvedField)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,9 @@ object TableOutputResolver extends SQLConfHelper with Logging {
* that exceed the column length are caught at runtime. Uses `getRawType` so it works for both
* V1 and V2 tables. Shared by the by-name and by-position default-fill paths.
*
* `applyColumnMetadata` strips the default's outer alias and re-wraps it with the required
* metadata, so the length check is applied to the default value itself (the alias child).
* We unwrap the default's outer alias before the length check so the check wraps the
* default value itself, not the alias; `applyColumnMetadata` then re-adds the required
* alias and metadata afterward.
*/
private def applyDefaultWithLengthCheck(
defaultExpr: Expression,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,25 @@ object ExtractValue {
}
}

/**
 * NULL-propagating counterpart of [[apply]], intended for resolution time. When the base
 * expression `child` has `NullType`, extracting any field, element or key from it produces a
 * `NullType` NULL literal (SQL NULL propagation) rather than failing with
 * `INVALID_EXTRACT_BASE_FIELD_TYPE`. Every other base type is handed to [[apply]] unchanged.
 *
 * `NullType` columns can show up, for example, after schema evolution with missing columns.
 * The user-facing extraction resolution sites (multipart name resolution and
 * `UnresolvedExtractValue` resolution) go through this method. `extractValue` is deliberately
 * not modified, so its other direct consumers keep their previous (throwing) behavior.
 *
 * @param child the base expression to extract from; its `dataType` is inspected
 * @param extraction the field name, index or key being extracted
 * @param resolver the name resolver forwarded to [[apply]] for non-`NullType` bases
 * @return a `NullType` NULL literal for a `NullType` base, otherwise the result of [[apply]]
 */
def applyOrNull(
    child: Expression,
    extraction: Expression,
    resolver: Resolver): Expression = child.dataType match {
  // Anything extracted from a NULL base is NULL, regardless of what `extraction` is.
  case NullType => Literal(null, NullType)
  case _ => apply(child, extraction, resolver)
}

/**
* Returns the resolved `ExtractValue`. It will return one kind of concrete `ExtractValue`,
* depending on the types of `child` and `extraction`.
Expand Down Expand Up @@ -119,13 +138,21 @@ object ExtractValue {
val withExtractedNestedFields = nestedFields
.foldLeft(Some(attribute): Option[Expression]) {
case (Some(expression), field) =>
ExtractValue.extractValue(
child = expression,
extraction = Literal(field),
resolver = resolver
) match {
case Left(e) => Some(e)
case Right(_) => None
// Extraction from a NULL (NullType) base propagates NULL rather than failing, matching
// the user-facing resolution sites (which use applyOrNull). Treating it as extractable
// here keeps the NullType candidate in single-pass NameScope candidate filtering so it
// resolves consistently with the legacy analyzer.
if (expression.dataType == NullType) {
Some(Literal(null, NullType))
} else {
ExtractValue.extractValue(
child = expression,
extraction = Literal(field),
resolver = resolver
) match {
case Left(e) => Some(e)
case Right(_) => None
}
}
case _ =>
None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,9 @@ package object expressions {
// Then this will add ExtractValue("c", ExtractValue("b", a)), and alias the final
// expression as "c".
val fieldExprs = nestedFields.foldLeft(a: Expression) { (e, name) =>
ExtractValue(e, Literal(name), resolver)
// applyOrNull propagates NULL when the base is NullType (e.g. a NullType column from
// schema evolution) instead of throwing INVALID_EXTRACT_BASE_FIELD_TYPE.
ExtractValue.applyOrNull(e, Literal(name), resolver)
}
Seq(Alias(fieldExprs, nestedFields.last)())

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
SELECT col.a FROM (SELECT null AS col) t
-- !query analysis
Project [null AS a#x]
+- SubqueryAlias t
+- Project [null AS col#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,30 @@ DROP TABLE t1
-- !query analysis
DropTable false, false
+- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1


-- !query
SELECT col.a FROM (SELECT null AS col) t
-- !query analysis
Project [null AS a#x]
+- SubqueryAlias t
+- Project [null AS col#x]
+- OneRowRelation


-- !query
SELECT col[0] FROM (SELECT null AS col) t
-- !query analysis
Project [null AS col[0]#x]
+- SubqueryAlias t
+- Project [null AS col#x]
+- OneRowRelation


-- !query
SELECT col['key'] FROM (SELECT null AS col) t
-- !query analysis
Project [null AS col[key]#x]
+- SubqueryAlias t
+- Project [null AS col#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -500,3 +500,12 @@ Project [sum_val#x]
+- Aggregate [col1#x], [(col1#x.nums[0] + col1#x.nums[1]) AS sum_val#x, col1#x]
+- SubqueryAlias t
+- LocalRelation [col1#x]


-- !query
SELECT NAMED_STRUCT('a', 1) AS col1 FROM VALUES (NULL) t (col1) GROUP BY col1 HAVING col1.a == 1
-- !query analysis
Filter (cast(null as int) = 1)
+- Aggregate [col1#x], [named_struct(a, 1) AS col1#x]
+- SubqueryAlias t
+- LocalRelation [col1#x]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- SPARK-57186: multipart field access (col.a) on a NullType base propagates NULL under the
-- single-pass resolver as well, consistently with the legacy analyzer. Dual-running both analyzers
-- locks in that consistency (no HYBRID_ANALYZER_EXCEPTION).
-- The col[0]/col['key'] subscript forms are intentionally not covered here: the single-pass
-- resolver does not resolve subscript extraction (UnresolvedExtractValue) at all -- a pre-existing
-- limitation independent of NullType -- so they are exercised only under the legacy analyzer in
-- extract-value-resolution-edge-cases.sql.
--SET spark.sql.analyzer.singlePassResolver.dualRunWithLegacy=true

SELECT col.a FROM (SELECT null AS col) t;
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,13 @@ SELECT col1.a, a FROM t1 ORDER BY col1.a;
SELECT split(col1, '-')[1] AS a FROM VALUES('a-b') ORDER BY split(col1, '-')[1];

DROP TABLE t1;

-- SPARK-57186: extracting a field/element/key from a NullType base returns NULL instead of
-- throwing INVALID_EXTRACT_BASE_FIELD_TYPE (SQL NULL propagation; a NullType column can arise e.g.
-- from schema evolution with missing columns). This applies uniformly to dotted field access
-- (`col.a`) and the subscript forms (`col[0]`, `col['key']`), and is implemented at the
-- user-facing resolution sites (ExtractValue.applyOrNull) without changing the shared
-- ExtractValue.extractValue utility.
SELECT col.a FROM (SELECT null AS col) t;
SELECT col[0] FROM (SELECT null AS col) t;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-blocking: for a NullType base, col.a now returns NULL while the equivalent subscript forms col[0]/col['key'] still throw INVALID_EXTRACT_BASE_FIELD_TYPE. col.a and col['a'] are equivalent struct-field-access syntaxes elsewhere in Spark, so this asymmetry is worth confirming as intentional. It's largely tied to the analyzer-divergence finding above — a consistent cross-analyzer fix is the natural point to decide whether the subscript forms should follow col.a here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Under the legacy analyzer all three (col.a, col[0], col['key']) now propagate NULL - the asymmetry is gone there. They're only dual-run for col.a because the single-pass resolver doesn't resolve subscript extraction at all today (normal a[0]/m['k'] fail under single-pass too; ExtractValueResolver is unwired) - a pre-existing limitation independent of NullType, so the subscript forms run under the legacy analyzer only.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: col[0] / col['key'] on a NullType base now produce an output column named NULL (analyzer plan: Project [null AS NULL#x]), whereas col.a is named a and a non-null arr[0] would be named arr[0]. Cosmetic only and an extreme edge case (NullType column + subscript), so not blocking -- just flagging the naming asymmetry in case a stable column name is preferable here.

Copy link
Copy Markdown
Contributor Author

@dejankrak-db dejankrak-db Jun 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed the column-name nit: col[0]/col['key'] on a NullType base are now aliased via toPrettySQL (so the output columns are named col[0]/col[key] instead of NULL, matching the non-null subscript naming); CleanupAliases trims this where it isn't a top-level projection output. Also updated the PR description to reflect the current implementation. Thanks @cloud-fan for the detailed reviews!

SELECT col['key'] FROM (SELECT null AS col) t;
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,10 @@ FROM VALUES (NAMED_STRUCT('nums', ARRAY(10, 20))) t (col1)
GROUP BY col1
HAVING col1.nums[0] + col1.nums[1] > 25
ORDER BY col1.nums[0];

-- SPARK-57186: Alias type: Struct, Table column type: NullType (void).
-- Unlike the STRING/ARRAY/MAP input bases above, which throw INVALID_EXTRACT_BASE_FIELD_TYPE for
-- this shadowing pattern, a NullType input column that shadows the struct alias yields NULL
-- (NULL propagation). The HAVING predicate is therefore NULL and the row is filtered out, giving
-- an empty result. NullType is intentionally the one base type that does not error here.
SELECT NAMED_STRUCT('a', 1) AS col1 FROM VALUES (NULL) t (col1) GROUP BY col1 HAVING col1.a == 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
SELECT col.a FROM (SELECT null AS col) t
-- !query schema
struct<a:void>
-- !query output
NULL
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,27 @@ DROP TABLE t1
struct<>
-- !query output



-- !query
SELECT col.a FROM (SELECT null AS col) t
-- !query schema
struct<a:void>
-- !query output
NULL


-- !query
SELECT col[0] FROM (SELECT null AS col) t
-- !query schema
struct<col[0]:void>
-- !query output
NULL


-- !query
SELECT col['key'] FROM (SELECT null AS col) t
-- !query schema
struct<col[key]:void>
-- !query output
NULL
Original file line number Diff line number Diff line change
Expand Up @@ -427,3 +427,11 @@ ORDER BY col1.nums[0]
struct<sum_val:int>
-- !query output
30


-- !query
SELECT NAMED_STRUCT('a', 1) AS col1 FROM VALUES (NULL) t (col1) GROUP BY col1 HAVING col1.a == 1
-- !query schema
struct<col1:struct<a:int>>
-- !query output