From 96fca0edd84b71271e7376ac2fdc34e0afd8d24b Mon Sep 17 00:00:00 2001 From: Szehon Ho Date: Tue, 25 Nov 2025 22:42:24 -0800 Subject: [PATCH 1/3] [SPARK-54525][SQL] Disable nested struct coercion in MERGE INTO --- .../apache/spark/sql/internal/SQLConf.scala | 4 +- .../connector/MergeIntoTableSuiteBase.scala | 1774 ++--------------- 2 files changed, 170 insertions(+), 1608 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8b50abbe4052..3b052aeee355 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -7903,8 +7903,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyXMLParserEnabled: Boolean = getConf(SQLConf.LEGACY_XML_PARSER_ENABLED) - def coerceMergeNestedTypes: Boolean = - getConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED) + // Disable until we define the semantics of UPDATE SET * with nested types + def coerceMergeNestedTypes: Boolean = false /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index 680fa63e0929..c3d1e43a58b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -3230,166 +3230,6 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge into schema evolution replace column with nested struct and set all columns") { - Seq(true, false).foreach { withSchemaEvolution => - withTempView("source") { - // Create table using Spark SQL - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s 
STRUCT<c1: INT, c2: STRUCT<a: ARRAY<INT>, m: MAP<STRING, STRING>>>, - |dep STRING) - |PARTITIONED BY (dep) - |""".stripMargin) - // Insert data using DataFrame API with objects - val tableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) - .coalesce(1).writeTo(tableNameAsString).append() - - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - // missing column 'a' - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - StructField("dep", StringType) - )) - val sourceData = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - if (withSchemaEvolution) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - 
assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `s`.`c2`")) - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - - test("merge into schema evolution replace column with nested struct and update " + - "top level struct") { - Seq(true, false).foreach { withSchemaEvolution => - withTempView("source") { - // Create table using Spark SQL - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT<c1: INT, c2: STRUCT<a: ARRAY<INT>, m: MAP<STRING, STRING>>>, - |dep STRING) - |PARTITIONED BY (dep) - |""".stripMargin) - - // Insert data using DataFrame API with objects - val tableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) - .coalesce(1).writeTo(tableNameAsString).append() - - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - // missing column 'a' - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - StructField("dep", StringType) - )) - val sourceData = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE 
$schemaEvolutionClause - |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET s = src.s - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - if (withSchemaEvolution) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "hr"), - Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `s`.`c2`")) - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - test("merge into schema evolution add column for struct in array and set all columns") { Seq(true, false).foreach { withSchemaEvolution => withTempView("source") { @@ -3511,337 +3351,90 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge into schema evolution replace column for struct in map and set all columns") { - Seq(true, false).foreach { withSchemaEvolution => + test("merge into empty table with NOT MATCHED clause schema evolution") { + Seq(true, false) foreach { withSchemaEvolution => withTempView("source") { - val schema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))), - StructType(Seq(StructField("c4", StringType), StructField("c5", StringType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(schema)) - - val data = Seq( - Row(0, Map(Row(10, 10) -> Row("c", "c")), "hr"), - Row(1, Map(Row(20, 20) -> Row("d", "d")), "sales")) - spark.createDataFrame(spark.sparkContext.parallelize(data), schema) - .writeTo(tableNameAsString).append() + createTable("pk INT NOT NULL, 
salary INT, dep STRING") - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))), - StructType(Seq(StructField("c4", StringType), StructField("c6", BooleanType))))), - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Map(Row(10, true) -> Row("y", false)), "sales"), - Row(2, Map(Row(20, false) -> Row("z", true)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") + val sourceRows = Seq( + (1, 100, "hr", true), + (2, 200, "finance", false), + (3, 300, "hr", true)) + sourceRows.toDF("pk", "salary", "dep", "active").createOrReplaceTempView("source") val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = + + sql( s"""MERGE $schemaEvolutionClause |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * + |USING source s + |ON t.pk = s.pk |WHEN NOT MATCHED THEN | INSERT * - |""".stripMargin + |""".stripMargin) if (withSchemaEvolution) { - sql(mergeStmt) checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(0, Map(Row(10, 10, null) -> Row("c", "c", null)), "hr"), - Row(1, Map(Row(10, null, true) -> Row("y", null, false)), "sales"), - Row(2, Map(Row(20, null, false) -> Row("z", null, true)), "engineering"))) + Seq( + Row(1, 100, "hr", true), + Row(2, 200, "finance", false), + Row(3, 300, "hr", true))) } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `m`.`key`")) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, 100, "hr"), + Row(2, 200, "finance"), + Row(3, 300, "hr"))) } + sql("DROP 
TABLE IF EXISTS " + tableNameAsString) } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") } } - test("merge into schema evolution replace column for struct in map and set explicit columns") { + test("Merge schema evolution should not evolve referencing new column via transform") { Seq(true, false).foreach { withSchemaEvolution => withTempView("source") { - val schema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))), - StructType(Seq(StructField("c4", StringType), StructField("c5", StringType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(schema)) - - val data = Seq( - Row(0, Map(Row(10, 10) -> Row("c", "c")), "hr"), - Row(1, Map(Row(20, 20) -> Row("d", "d")), "sales")) - spark.createDataFrame(spark.sparkContext.parallelize(data), schema) - .writeTo(tableNameAsString).append() + createAndInitTable("pk INT NOT NULL, salary INT, dep STRING", + """{ "pk": 1, "salary": 100, "dep": "hr" } + |{ "pk": 2, "salary": 200, "dep": "software" } + |""".stripMargin) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))), - StructType(Seq(StructField("c4", StringType), StructField("c6", BooleanType))))), - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Map(Row(10, true) -> Row("y", false)), "sales"), - Row(2, Map(Row(20, false) -> Row("z", true)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") + val sourceDF = Seq((2, 150, "dummy", "blah"), + (3, 250, "dummy", "blah")).toDF("pk", "salary", "dep", "extra") + sourceDF.createOrReplaceTempView("source") val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" val mergeStmt = s"""MERGE 
$schemaEvolutionClause |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk + |USING source s + |ON t.pk = s.pk |WHEN MATCHED THEN - | UPDATE SET t.m = src.m, t.dep = 'my_old_dep' - |WHEN NOT MATCHED THEN - | INSERT (pk, m, dep) VALUES (src.pk, src.m, 'my_new_dep') + | UPDATE SET extra=substring(s.extra, 1, 2) |""".stripMargin - if (withSchemaEvolution) { + + val e = intercept[org.apache.spark.sql.AnalysisException] { sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(0, Map(Row(10, 10, null) -> Row("c", "c", null)), "hr"), - Row(1, Map(Row(10, null, true) -> Row("y", null, false)), "my_old_dep"), - Row(2, Map(Row(20, null, false) -> Row("z", null, true)), "my_new_dep"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `m`.`key`")) } + assert(e.errorClass.get == "UNRESOLVED_COLUMN.WITH_SUGGESTION") + assert(e.getMessage.contains("A column, variable, or function parameter with name " + + "`extra` cannot be resolved")) + + sql(s"DROP TABLE $tableNameAsString") } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") } } - test("merge into schema evolution replace column for struct in array and set all columns") { + test("Merge schema evolution should not evolve if not directly referencing new column: update") { Seq(true, false).foreach { withSchemaEvolution => withTempView("source") { - val schema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("a", ArrayType( - StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(schema)) - - val data = Seq( - Row(0, Array(Row(10, 10)), "hr"), - Row(1, Array(Row(20, 20)), "sales")) - 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema) - .writeTo(tableNameAsString).append() - - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("a", ArrayType( - StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))))), - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Array(Row(10, true)), "sales"), - Row(2, Array(Row(20, false)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - - if (withSchemaEvolution) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(0, Array(Row(10, 10, null)), "hr"), - Row(1, Array(Row(10, null, true)), "sales"), - Row(2, Array(Row(20, null, false)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `a`.`element`")) - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - - test("merge into schema evolution replace column for struct in array and set explicit columns") { - Seq(true, false).foreach { withSchemaEvolution => - withTempView("source") { - val schema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("a", ArrayType( - StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(schema)) - - val data = Seq( - 
Row(0, Array(Row(10, 10)), "hr"), - Row(1, Array(Row(20, 20)), "sales")) - spark.createDataFrame(spark.sparkContext.parallelize(data), schema) - .writeTo(tableNameAsString).append() - - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("a", ArrayType( - StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))))), - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Array(Row(10, true)), "sales"), - Row(2, Array(Row(20, false)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET t.a = src.a, t.dep = 'my_old_dep' - |WHEN NOT MATCHED THEN - | INSERT (pk, a, dep) VALUES (src.pk, src.a, 'my_new_dep') - |""".stripMargin - - if (withSchemaEvolution) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(0, Array(Row(10, 10, null)), "hr"), - Row(1, Array(Row(10, null, true)), "my_old_dep"), - Row(2, Array(Row(20, null, false)), "my_new_dep"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `a`.`element`")) - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - test("merge into empty table with NOT MATCHED clause schema evolution") { - Seq(true, false) foreach { withSchemaEvolution => - withTempView("source") { - createTable("pk INT NOT NULL, salary INT, dep STRING") - - val sourceRows = Seq( - (1, 100, "hr", true), - (2, 200, "finance", false), - (3, 300, "hr", true)) - sourceRows.toDF("pk", 
"salary", "dep", "active").createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - - sql( - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source s - |ON t.pk = s.pk - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - - if (withSchemaEvolution) { - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, 100, "hr", true), - Row(2, 200, "finance", false), - Row(3, 300, "hr", true))) - } else { - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, 100, "hr"), - Row(2, 200, "finance"), - Row(3, 300, "hr"))) - } - sql("DROP TABLE IF EXISTS " + tableNameAsString) - } - } - } - - test("Merge schema evolution should not evolve referencing new column via transform") { - Seq(true, false).foreach { withSchemaEvolution => - withTempView("source") { - createAndInitTable("pk INT NOT NULL, salary INT, dep STRING", - """{ "pk": 1, "salary": 100, "dep": "hr" } - |{ "pk": 2, "salary": 200, "dep": "software" } - |""".stripMargin) - - val sourceDF = Seq((2, 150, "dummy", "blah"), - (3, 250, "dummy", "blah")).toDF("pk", "salary", "dep", "extra") - sourceDF.createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source s - |ON t.pk = s.pk - |WHEN MATCHED THEN - | UPDATE SET extra=substring(s.extra, 1, 2) - |""".stripMargin - - - val e = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(e.errorClass.get == "UNRESOLVED_COLUMN.WITH_SUGGESTION") - assert(e.getMessage.contains("A column, variable, or function parameter with name " + - "`extra` cannot be resolved")) - - sql(s"DROP TABLE $tableNameAsString") - } - } - } - - test("Merge schema evolution should not evolve if not directly referencing new column: update") { - Seq(true, false).foreach { withSchemaEvolution => 
- withTempView("source") { - createAndInitTable("pk INT NOT NULL, salary INT, dep STRING", - """{ "pk": 1, "salary": 100, "dep": "hr" } - |{ "pk": 2, "salary": 200, "dep": "software" } - |""".stripMargin) + createAndInitTable("pk INT NOT NULL, salary INT, dep STRING", + """{ "pk": 1, "salary": 100, "dep": "hr" } + |{ "pk": 2, "salary": 200, "dep": "software" } + |""".stripMargin) val sourceDF = Seq((2, 150, "dummy", "blah"), (3, 250, "dummy", "blah")).toDF("pk", "salary", "dep", "extra") @@ -4446,203 +4039,6 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge into with source missing fields in struct nested in array") { - withTempView("source") { - // Target table has struct with 3 fields (c1, c2, c3) in array - createAndInitTable( - s"""pk INT NOT NULL, - |a ARRAY<STRUCT<c1: INT, c2: STRING, c3: BOOLEAN>>, - |dep STRING""".stripMargin, - """{ "pk": 0, "a": [ { "c1": 1, "c2": "a", "c3": true } ], "dep": "sales" } - |{ "pk": 1, "a": [ { "c1": 2, "c2": "b", "c3": false } ], "dep": "sales" }""" - .stripMargin) - - // Source table has struct with only 2 fields (c1, c2) - missing c3 - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("a", ArrayType( - StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType))))), // missing c3 field - StructField("dep", StringType))) - val data = Seq( - Row(1, Array(Row(10, "c")), "hr"), - Row(2, Array(Row(30, "e")), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - sql( - s"""MERGE INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - - // Missing field c3 should be filled with NULL - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Array(Row(1, "a", true)), "sales"), - Row(1, Array(Row(10, "c", null)), "hr"), - Row(2, Array(Row(30, 
"e", null)), "engineering"))) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - - test("merge into with source missing fields in struct nested in map key") { - withTempView("source") { - // Target table has struct with 2 fields in map key - val targetSchema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType), StructField("c2", BooleanType))), - StructType(Seq(StructField("c3", StringType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(targetSchema)) - - val targetData = Seq( - Row(0, Map(Row(10, true) -> Row("x")), "hr"), - Row(1, Map(Row(20, false) -> Row("y")), "sales")) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - // Source table has struct with only 1 field (c1) in map key - missing c2 - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType))), // missing c2 - StructType(Seq(StructField("c3", StringType))))), - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Map(Row(10) -> Row("z")), "sales"), - Row(2, Map(Row(20) -> Row("w")), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - sql( - s"""MERGE INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - - // Missing field c2 should be filled with NULL - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Map(Row(10, true) -> Row("x")), "hr"), - Row(1, Map(Row(10, null) -> Row("z")), "sales"), - Row(2, Map(Row(20, null) -> Row("w")), "engineering"))) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - - test("merge into with source missing fields in 
struct nested in map value") { - withTempView("source") { - // Target table has struct with 2 fields in map value - val targetSchema = - StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType))), - StructType(Seq(StructField("c1", StringType), StructField("c2", BooleanType))))), - StructField("dep", StringType))) - createTable(CatalogV2Util.structTypeToV2Columns(targetSchema)) - - val targetData = Seq( - Row(0, Map(Row(10) -> Row("x", true)), "hr"), - Row(1, Map(Row(20) -> Row("y", false)), "sales")) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - // Source table has struct with only 1 field (c1) in map value - missing c2 - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("m", MapType( - StructType(Seq(StructField("c1", IntegerType))), - StructType(Seq(StructField("c1", StringType))))), // missing c2 - StructField("dep", StringType))) - val sourceData = Seq( - Row(1, Map(Row(10) -> Row("z")), "sales"), - Row(2, Map(Row(20) -> Row("w")), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source") - - sql( - s"""MERGE INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - - // Missing field c2 should be filled with NULL - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Map(Row(10) -> Row("x", true)), "hr"), - Row(1, Map(Row(10) -> Row("z", null)), "sales"), - Row(2, Map(Row(20) -> Row("w", null)), "engineering"))) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - - test("merge into with source missing fields in top-level struct") { - withTempView("source") { - // Target table has struct with 3 fields at top level - createAndInitTable( - s"""pk 
INT NOT NULL, - |s STRUCT<c1: INT, c2: STRING, c3: BOOLEAN>, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": "a", "c3": true }, "dep": "sales"}""") - - // Source table has struct with only 2 fields (c1, c2) - missing c3 - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType)))), // missing c3 field - StructField("dep", StringType))) - val data = Seq( - Row(1, Row(10, "b"), "hr"), - Row(2, Row(20, "c"), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - sql( - s"""MERGE INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - - // Missing field c3 should be filled with NULL - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, "a", true), "sales"), - Row(1, Row(10, "b", null), "hr"), - Row(2, Row(20, "c", null), "engineering"))) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - test("merge with null struct") { withTempView("source") { createAndInitTable( @@ -4738,297 +4134,18 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase withTempView("source") { createAndInitTable( s"""pk INT NOT NULL, - |s STRUCT<c1: INT, c2: STRING> NOT NULL, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": "a" }, "dep": "sales" } - |{ "pk": 1, "s": { "c1": 2, "c2": "b" }, "dep": "hr" }""" - .stripMargin) - - // Source table has null for the struct column - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType) - ))), - StructField("dep", StringType) - )) - - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), 
sourceTableSchema) - .createOrReplaceTempView("source") - - // Should throw an exception when trying to insert/update null into NOT NULL column - val exception = intercept[Exception] { - sql( - s"""MERGE INTO $tableNameAsString t USING source - |ON t.pk = source.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - } - assert(exception.getMessage.contains( - "NULL value appeared in non-nullable field")) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - - test("merge with with null struct with missing nested field") { - Seq(true, false).foreach { coerceNestedTypes => - withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> - coerceNestedTypes.toString) { - withTempView("source") { - // Target table has nested struct with fields c1 and c2 - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT<c1: INT, c2: STRUCT<a: INT, b: STRING>>, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } - |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" - .stripMargin) - - // Source table has null for the nested struct - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType) - // missing field 'b' - ))) - ))), - StructField("dep", StringType) - )) - - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - val mergeStmt = - s"""MERGE INTO $tableNameAsString t USING source - |ON t.pk = source.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - - if (coerceNestedTypes) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, null, "engineering"), - Row(2, 
null, "finance"))) - } else { - // Without coercion, the merge should fail due to missing field - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot find data for the output column `s`.`c2`.`b`.")) - } - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - - test("merge null struct with schema evolution - source with missing and extra nested fields") { - Seq(true, false).foreach { withSchemaEvolution => - Seq(true, false).foreach { coerceNestedTypes => - withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> - coerceNestedTypes.toString) { - withTempView("source") { - // Target table has nested struct with fields c1 and c2 - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT<c1: INT, c2: STRUCT<a: INT, b: STRING>>, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } - |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" - .stripMargin) - - // Source table has missing field 'b' and extra field 'c' in nested struct - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - // missing field 'b' - StructField("c", StringType) // extra field 'c' - ))) - ))), - StructField("dep", StringType) - )) - - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t USING source - |ON t.pk = source.pk - |WHEN MATCHED THEN - | 
UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - - if (coerceNestedTypes) { - if (withSchemaEvolution) { - // extra nested field is added - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, Row(10, "x", null)), "sales"), - Row(1, null, "engineering"), - Row(2, null, "finance"))) - } else { - // extra nested field is not added - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot write extra fields `c` to the struct `s`.`c2`")) - } - } else { - // Without source struct coercion, the merge should fail - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot find data for the output column `s`.`c2`.`b`.")) - } - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - } - - test("merge null struct with non-nullable nested field - source with missing " + - "and extra nested fields") { - withSQLConf( - SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> "true") { - withTempView("source") { - // Target table has nested struct with NON-NULLABLE field b - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT<c1: INT, c2: STRUCT<a: INT, b: STRING NOT NULL>>, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } - |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" - .stripMargin) - - // Source table has missing field 'b' and extra field 'c' in nested struct - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - 
StructField("c2", StructType(Seq( - StructField("a", IntegerType), - // missing field 'b' (which is non-nullable in target) - StructField("c", StringType) // extra field 'c' - ))) - ))), - StructField("dep", StringType) - )) - - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - val mergeStmt = - s"""MERGE WITH SCHEMA EVOLUTION - |INTO $tableNameAsString t USING source - |ON t.pk = source.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - - // All cases should fail due to non-nullable constraint violation - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") - assert(exception.getMessage.contains("Cannot write incompatible data for the table ``: " + - "Cannot find data for the output column `s`.`c2`.`b`.")) - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - - test("merge with null struct using default value") { - withTempView("source") { - // Target table has nested struct with a default value - sql( - s"""CREATE TABLE $tableNameAsString ( - | pk INT NOT NULL, - | s STRUCT> DEFAULT - | named_struct('c1', 999, 'c2', named_struct('a', 999, 'b', 'default')), - | dep STRING) - |PARTITIONED BY (dep) - |""".stripMargin) - - // Insert initial data using DataFrame API - val initialSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - StructField("b", StringType) - ))) - ))), - StructField("dep", StringType) - )) - val initialData = Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, Row(2, Row(20, "y")), "hr") - ) - spark.createDataFrame(spark.sparkContext.parallelize(initialData), 
initialSchema) - .writeTo(tableNameAsString).append() + |s STRUCT NOT NULL, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": "a" }, "dep": "sales" } + |{ "pk": 1, "s": { "c1": 2, "c2": "b" }, "dep": "hr" }""" + .stripMargin) - // Source table has null for the nested struct + // Source table has null for the struct column val sourceTableSchema = StructType(Seq( StructField("pk", IntegerType), StructField("s", StructType(Seq( StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType) - // missing field 'b' - ))) + StructField("c2", StringType) ))), StructField("dep", StringType) )) @@ -5040,25 +4157,23 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) .createOrReplaceTempView("source") - sql( - s"""MERGE INTO $tableNameAsString t USING source - |ON t.pk = source.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, null, "engineering"), - Row(2, null, "finance"))) + // Should throw an exception when trying to insert/update null into NOT NULL column + val exception = intercept[Exception] { + sql( + s"""MERGE INTO $tableNameAsString t USING source + |ON t.pk = source.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin) + } + assert(exception.getMessage.contains( + "NULL value appeared in non-nullable field")) } sql(s"DROP TABLE IF EXISTS $tableNameAsString") } - test("merge with source missing struct column with default value") { withTempView("source") { // Target table has nested struct with a default value @@ -5123,68 +4238,6 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge into with source missing fields in nested struct") { - Seq(true, false).foreach { 
nestedTypeCoercion => - withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key - -> nestedTypeCoercion.toString) { - withTempView("source") { - // Target table has nested struct: s.c1, s.c2.a, s.c2.b - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT>, - |dep STRING""".stripMargin, - """{ "pk": 1, "s": { "c1": 2, "c2": { "a": 10, "b": true } } } - |{ "pk": 2, "s": { "c1": 2, "c2": { "a": 30, "b": false } } }""".stripMargin) - - // Source table is missing field 'b' in nested struct s.c2 - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType) - // missing field 'b' - ))) - ))), - StructField("dep", StringType) - )) - val data = Seq( - Row(1, Row(10, Row(20)), "sales"), - Row(2, Row(20, Row(30)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") - - // Missing field b should be filled with NULL - val mergeStmt = s"""MERGE INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET * - |WHEN NOT MATCHED THEN - | INSERT * - |""".stripMargin - - if (nestedTypeCoercion) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, Row(10, Row(20, true)), "sales"), - Row(2, Row(20, Row(30, false)), "engineering"))) - } else { - val exception = intercept[Exception] { - sql(mergeStmt) - } - assert(exception.getMessage.contains( - """Cannot write incompatible data for the table ``""".stripMargin)) - } - } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") - } - } - } - test("merge with named_struct missing non-nullable field") { withTempView("source") { createAndInitTable( @@ -5360,210 +4413,9 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase val e = intercept[org.apache.spark.sql.AnalysisException] { 
mergeBuilder.merge() } - assert(e.errorClass.get == "UNRESOLVED_COLUMN.WITH_SUGGESTION") - assert(e.getMessage.contains("A column, variable, or function parameter with name " + - "`active` cannot be resolved")) - } - - sql(s"DROP TABLE $tableNameAsString") - } - } - } - - test("merge schema evolution add column with nested struct and set explicit columns " + - "using dataframe API") { - Seq(true, false).foreach { withSchemaEvolution => - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT, m: MAP>>, - |dep STRING)""".stripMargin) - - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) - - val data = Seq( - Row(1, Row(10, Row(Array(3, 4), Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Array(4, 5), Map("e" -> "f"), true)), "engineering") - ) - val sourceTableSchema = StructType(Seq( - 
StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") - - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") - - val mergeBuilder = spark.table(sourceTable) - .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) - .whenMatched() - .update(Map( - "s.c1" -> lit(-1), - "s.c2.m" -> map(lit("k"), lit("v")), - "s.c2.a" -> array(lit(-1)), - "s.c2.c3" -> col("source_table.s.c2.c3"))) - .whenNotMatched() - .insert(Map( - "pk" -> col("source_table.pk"), - "s" -> struct( - col("source_table.s.c1").as("c1"), - struct( - col("source_table.s.c2.a").as("a"), - map(lit("g"), lit("h")).as("m"), - lit(true).as("c3") - ).as("c2") - ), - "dep" -> col("source_table.dep"))) - - if (withSchemaEvolution) { - mergeBuilder.withSchemaEvolution().merge() - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), - Row(2, Row(20, Row(Seq(4, 5), Map("g" -> "h"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() - } - assert(exception.errorClass.get == "FIELD_NOT_FOUND") - assert(exception.getMessage.contains("No such struct field `c3` in `a`, `m`. 
")) - } - - sql(s"DROP TABLE $tableNameAsString") - } - } - } - - test("merge schema evolution add column with nested struct and set all columns " + - "using dataframe API") { - Seq(true, false).foreach { withSchemaEvolution => - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT, m: MAP>>, - |dep STRING)""".stripMargin) - - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) - - val data = Seq( - Row(1, Row(10, Row(Array(3, 4), Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Array(4, 5), Map("e" -> "f"), true)), "engineering") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", 
MapType(StringType, StringType)), - StructField("c3", BooleanType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") - - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") - - val mergeBuilder = spark.table(sourceTable) - .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) - .whenMatched() - .updateAll() - .whenNotMatched() - .insertAll() - - if (withSchemaEvolution) { - mergeBuilder.withSchemaEvolution().merge() - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(1, Row(10, Row(Seq(3, 4), Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Seq(4, 5), Map("e" -> "f"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() - } - assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `s`.`c2`")) + assert(e.errorClass.get == "UNRESOLVED_COLUMN.WITH_SUGGESTION") + assert(e.getMessage.contains("A column, variable, or function parameter with name " + + "`active` cannot be resolved")) } sql(s"DROP TABLE $tableNameAsString") @@ -5571,8 +4423,8 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge schema evolution replace column with nested struct and " + - "set explicit columns using dataframe API") { + test("merge schema evolution add column with nested struct and set explicit columns " + + "using dataframe API") { Seq(true, false).foreach { withSchemaEvolution => val sourceTable = "cat.ns1.source_table" withTable(sourceTable) { @@ -5605,7 +4457,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase Column.create("s", StructType(Seq( StructField("c1", IntegerType), StructField("c2", StructType(Seq( - // removed column 'a' + 
StructField("a", ArrayType(IntegerType)), StructField("m", MapType(StringType, StringType)), StructField("c3", BooleanType) // new column ))) @@ -5618,14 +4470,15 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase catalog.createTable(sourceIdent, tableInfo) val data = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + Row(1, Row(10, Row(Array(3, 4), Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Array(4, 5), Map("e" -> "f"), true)), "engineering") ) val sourceTableSchema = StructType(Seq( StructField("pk", IntegerType, nullable = false), StructField("s", StructType(Seq( StructField("c1", IntegerType), StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), StructField("m", MapType(StringType, StringType)), StructField("c3", BooleanType) ))) @@ -5651,7 +4504,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase "s" -> struct( col("source_table.s.c1").as("c1"), struct( - array(lit(-2)).as("a"), + col("source_table.s.c2.a").as("a"), map(lit("g"), lit("h")).as("m"), lit(true).as("c3") ).as("c2") @@ -5663,7 +4516,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), - Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) + Row(2, Row(20, Row(Seq(4, 5), Map("g" -> "h"), true)), "engineering"))) } else { val exception = intercept[org.apache.spark.sql.AnalysisException] { mergeBuilder.merge() @@ -5677,7 +4530,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge schema evolution replace column with nested struct and set all columns " + + test("merge schema evolution add column with nested struct and set all columns " + "using dataframe API") { Seq(true, false).foreach { withSchemaEvolution => val sourceTable = 
"cat.ns1.source_table" @@ -5686,26 +4539,24 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase s"""CREATE TABLE $tableNameAsString ( |pk INT NOT NULL, |s STRUCT, m: MAP>>, - |dep STRING) - |PARTITIONED BY (dep) - |""".stripMargin) + |dep STRING)""".stripMargin) - val tableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) - .coalesce(1).writeTo(tableNameAsString).append() + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() val sourceIdent = Identifier.of(Array("ns1"), "source_table") val columns = Array( @@ -5713,7 +4564,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase Column.create("s", StructType(Seq( StructField("c1", IntegerType), StructField("c2", StructType(Seq( - // missing column 'a' + StructField("a", ArrayType(IntegerType)), StructField("m", MapType(StringType, StringType)), StructField("c3", BooleanType) // new column ))) @@ -5725,22 +4576,23 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase .build() catalog.createTable(sourceIdent, tableInfo) - val 
sourceData = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + val data = Seq( + Row(1, Row(10, Row(Array(3, 4), Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Array(4, 5), Map("e" -> "f"), true)), "engineering") ) val sourceTableSchema = StructType(Seq( StructField("pk", IntegerType, nullable = false), StructField("s", StructType(Seq( StructField("c1", IntegerType), StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), StructField("m", MapType(StringType, StringType)), StructField("c3", BooleanType) ))) ))), StructField("dep", StringType) )) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) .createOrReplaceTempView("source_temp") sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") @@ -5756,15 +4608,13 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase mergeBuilder.withSchemaEvolution().merge() checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) + Seq(Row(1, Row(10, Row(Seq(3, 4), Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Seq(4, 5), Map("e" -> "f"), true)), "engineering"))) } else { val exception = intercept[org.apache.spark.sql.AnalysisException] { mergeBuilder.merge() } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") assert(exception.getMessage.contains( "Cannot write extra fields `c3` to the struct `s`.`c2`")) } @@ -5775,7 +4625,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } test("merge schema evolution replace column with nested struct and " + - "update top level struct using dataframe 
API") { + "set explicit columns using dataframe API") { Seq(true, false).foreach { withSchemaEvolution => val sourceTable = "cat.ns1.source_table" withTable(sourceTable) { @@ -5783,11 +4633,12 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase s"""CREATE TABLE $tableNameAsString ( |pk INT NOT NULL, |s STRUCT, m: MAP>>, - |dep STRING) - |PARTITIONED BY (dep) - |""".stripMargin) + |dep STRING)""".stripMargin) - val tableSchema = StructType(Seq( + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + val targetSchema = StructType(Seq( StructField("pk", IntegerType, nullable = false), StructField("s", StructType(Seq( StructField("c1", IntegerType), @@ -5798,73 +4649,80 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase ))), StructField("dep", StringType) )) - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) - .coalesce(1).writeTo(tableNameAsString).append() + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() - // Create source table - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - // missing column 'a' - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + 
StructField("c2", StructType(Seq( + // removed column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) - val sourceData = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) - .createOrReplaceTempView("source_temp") + val data = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source_temp") - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") val mergeBuilder = spark.table(sourceTable) .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) .whenMatched() - .update(Map("s" -> col("source_table.s"))) + .update(Map( + "s.c1" -> lit(-1), + "s.c2.m" -> map(lit("k"), lit("v")), + "s.c2.a" -> array(lit(-1)), + 
"s.c2.c3" -> col("source_table.s.c2.c3"))) .whenNotMatched() - .insertAll() + .insert(Map( + "pk" -> col("source_table.pk"), + "s" -> struct( + col("source_table.s.c1").as("c1"), + struct( + array(lit(-2)).as("a"), + map(lit("g"), lit("h")).as("m"), + lit(true).as("c3") + ).as("c2") + ), + "dep" -> col("source_table.dep"))) if (withSchemaEvolution) { mergeBuilder.withSchemaEvolution().merge() checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "hr"), - Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) + Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), + Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) } else { val exception = intercept[org.apache.spark.sql.AnalysisException] { mergeBuilder.merge() } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write extra fields `c3` to the struct `s`.`c2`")) + assert(exception.errorClass.get == "FIELD_NOT_FOUND") + assert(exception.getMessage.contains("No such struct field `c3` in `a`, `m`. 
")) } sql(s"DROP TABLE $tableNameAsString") @@ -5932,302 +4790,6 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge into with source missing fields in top-level struct using dataframe API") { - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - // Target table has struct with 3 fields at top level - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT, - |dep STRING)""".stripMargin) - - val targetData = Seq( - Row(0, Row(1, "a", true), "sales") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType), - StructField("c3", BooleanType) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - // Create source table with struct having only 2 fields (c1, c2) - missing c3 - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType)))), // missing c3 field - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) - - val data = Seq( - Row(1, Row(10, "b"), "hr"), - Row(2, Row(20, "c"), "engineering") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StringType)))), - StructField("dep", StringType))) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") - - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") - - 
spark.table(sourceTable) - .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) - .whenMatched() - .updateAll() - .whenNotMatched() - .insertAll() - .merge() - - // Missing field c3 should be filled with NULL - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, "a", true), "sales"), - Row(1, Row(10, "b", null), "hr"), - Row(2, Row(20, "c", null), "engineering"))) - - sql(s"DROP TABLE $tableNameAsString") - } - } - - test("merge with null struct with missing nested field using dataframe API") { - Seq(true, false).foreach { coerceNestedTypes => - withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> - coerceNestedTypes.toString) { - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - // Target table has nested struct with fields c1 and c2 - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT>, - |dep STRING)""".stripMargin) - - val targetData = Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, Row(2, Row(20, "y")), "hr") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - StructField("b", StringType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - // Create source table with missing nested field 'b' - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType) - // missing field 'b' - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() 
- catalog.createTable(sourceIdent, tableInfo) - - // Source table has null for the nested struct - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") - - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") - val mergeBuilder = spark.table(sourceTable) - .mergeInto(tableNameAsString, - $"source_table.pk" === col(tableNameAsString + ".pk")) - .whenMatched() - .updateAll() - .whenNotMatched() - .insertAll() - - if (coerceNestedTypes) { - mergeBuilder.merge() - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, null, "engineering"), - Row(2, null, "finance"))) - } else { - // Without coercion, the merge should fail due to missing field - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot find data for the output column `s`.`c2`.`b`.")) - } - - sql(s"DROP TABLE $tableNameAsString") - } - } - } - } - - test("merge null struct with schema evolution - " + - "source with missing and extra nested fields using dataframe API") { - Seq(true, false).foreach { withSchemaEvolution => - Seq(true, false).foreach { coerceNestedTypes => - withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> - coerceNestedTypes.toString) { - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - // Target table has nested struct with fields c1 and c2 - sql( - 
s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT>, - |dep STRING)""".stripMargin) - - val targetData = Seq( - Row(0, Row(1, Row(10, "x")), "sales"), - Row(1, Row(2, Row(20, "y")), "hr") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - StructField("b", StringType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() - - // Create source table with missing field 'b' and extra field 'c' in nested struct - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - // missing field 'b' - StructField("c", StringType) // extra field 'c' - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) - - // Source data has null for the nested struct - val data = Seq( - Row(1, null, "engineering"), - Row(2, null, "finance") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", IntegerType), - StructField("c", StringType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") - - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") - - val mergeBuilder = spark.table(sourceTable) - .mergeInto(tableNameAsString, $"source_table.pk" === 
col(tableNameAsString + ".pk")) - .whenMatched() - .updateAll() - .whenNotMatched() - .insertAll() - - if (coerceNestedTypes) { - if (withSchemaEvolution) { - // extra nested field is added - mergeBuilder.withSchemaEvolution().merge() - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq( - Row(0, Row(1, Row(10, "x", null)), "sales"), - Row(1, null, "engineering"), - Row(2, null, "finance"))) - } else { - // extra nested field is not added - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot write extra fields `c` to the struct `s`.`c2`")) - } - } else { - // Without source struct coercion, the merge should fail - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() - } - assert(exception.errorClass.get == - "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") - assert(exception.getMessage.contains( - "Cannot write incompatible data for the table ``: " + - "Cannot find data for the output column `s`.`c2`.`b`.")) - } - - sql(s"DROP TABLE $tableNameAsString") - } - } - } - } - } - test("Merge schema evolution should error on non-existent column in UPDATE and INSERT") { withTable(tableNameAsString) { withTempView("source") { From ac75170f1b11e6e2a0811026594f9d590e76993c Mon Sep 17 00:00:00 2001 From: Szehon Ho Date: Wed, 26 Nov 2025 10:41:37 -0800 Subject: [PATCH 2/3] Set the flag to false and preserve unit tests This reverts commit 96fca0edd84b71271e7376ac2fdc34e0afd8d24b. 
--- .../apache/spark/sql/internal/SQLConf.scala | 9 +- .../connector/MergeIntoTableSuiteBase.scala | 2003 +++++++++++++++-- 2 files changed, 1850 insertions(+), 162 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3b052aeee355..85011da24de9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -6699,10 +6699,11 @@ object SQLConf { buildConf("spark.sql.merge.nested.type.coercion.enabled") .internal() .doc("If enabled, allow MERGE INTO to coerce source nested types if they have less" + - "nested fields than the target table's nested types.") + "nested fields than the target table's nested types. This is experimental and" + + "the semantics may change.") .version("4.1.0") .booleanConf - .createWithDefault(true) + .createWithDefault(false) /** * Holds information about keys that have been deprecated. 
@@ -7903,8 +7904,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyXMLParserEnabled: Boolean = getConf(SQLConf.LEGACY_XML_PARSER_ENABLED) - // Disable until we define the semantics of UPDATE SET * with nested types - def coerceMergeNestedTypes: Boolean = false + def coerceMergeNestedTypes: Boolean = + getConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED) /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index c3d1e43a58b0..e0432dc3227f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -3171,62 +3171,275 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase test("merge into schema evolution replace column with nested struct and set explicit columns") { Seq(true, false).foreach { withSchemaEvolution => - withTempView("source") { - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT, m: MAP>>, - |dep STRING""".stripMargin, - """{ "pk": 1, "s": { "c1": 2, "c2": { "a": [1,2], "m": { "a": "b" } } }, "dep": "hr" }""") + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, + | m: MAP>>, + |dep STRING)""".stripMargin) + + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + val targetData = 
Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + spark.createDataFrame( + spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) + ))) + ))), + StructField("dep", StringType) + )) + val data = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - // removed column 'a' - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - StructField("dep", StringType) - )) - val data = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") - ) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source") + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1), + | s.c2.c3 = src.s.c2.c3 + |WHEN NOT MATCHED THEN + | INSERT (pk, s, dep) VALUES (src.pk, + | named_struct('c1', src.s.c1, + | 'c2', named_struct('a', array(-2), 'm', map('g', 'h'), 'c3', true)), src.dep) + |""".stripMargin - val schemaEvolutionClause = if (withSchemaEvolution) 
"WITH SCHEMA EVOLUTION" else "" - val mergeStmt = - s"""MERGE $schemaEvolutionClause - |INTO $tableNameAsString t - |USING source src - |ON t.pk = src.pk - |WHEN MATCHED THEN - | UPDATE SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1), - | s.c2.c3 = src.s.c2.c3 - |WHEN NOT MATCHED THEN - | INSERT (pk, s, dep) VALUES (src.pk, - | named_struct('c1', src.s.c1, - | 'c2', named_struct('a', array(-2), 'm', map('g', 'h'), 'c3', true)), src.dep) - |""".stripMargin + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), + Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "FIELD_NOT_FOUND") + assert(exception.getMessage.contains("No such struct field `c3` in `a`, `m`. ")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } - if (withSchemaEvolution) { - sql(mergeStmt) - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), - Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - sql(mergeStmt) + test("merge into schema evolution replace column with nested struct and set all columns") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Create table using Spark SQL + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, m: MAP>>, + |dep STRING) + |PARTITIONED BY (dep) + |""".stripMargin) + // Insert data using DataFrame API with objects + val tableSchema = StructType(Seq( + StructField("pk", IntegerType, 
nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) + .coalesce(1).writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + // missing column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + StructField("dep", StringType) + )) + val sourceData = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct 
`s`.`c2`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`a`")) + } } - assert(exception.errorClass.get == "FIELD_NOT_FOUND") - assert(exception.getMessage.contains("No such struct field `c3` in `a`, `m`. ")) + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } + + test("merge into schema evolution replace column with nested struct and update " + + "top level struct") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Create table using Spark SQL + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, m: MAP>>, + |dep STRING) + |PARTITIONED BY (dep) + |""".stripMargin) + + // Insert data using DataFrame API with objects + val tableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) + .coalesce(1).writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + // missing column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + 
StructField("dep", StringType) + )) + val sourceData = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET s = src.s + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "hr"), + Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `s`.`c2`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`a`")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") } } - sql(s"DROP TABLE IF EXISTS $tableNameAsString") } } @@ -3351,6 +3564,313 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } + test("merge into schema evolution replace column for struct in map and set all columns") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + 
withTempView("source") { + val schema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))), + StructType(Seq(StructField("c4", StringType), StructField("c5", StringType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(schema)) + + val data = Seq( + Row(0, Map(Row(10, 10) -> Row("c", "c")), "hr"), + Row(1, Map(Row(20, 20) -> Row("d", "d")), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))), + StructType(Seq(StructField("c4", StringType), StructField("c6", BooleanType))))), + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Map(Row(10, true) -> Row("y", false)), "sales"), + Row(2, Map(Row(20, false) -> Row("z", true)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(0, Map(Row(10, 10, null) -> Row("c", "c", null)), "hr"), + Row(1, Map(Row(10, null, true) -> Row("y", null, false)), "sales"), + Row(2, Map(Row(20, null, false) -> Row("z", null, true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + 
assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `m`.`key`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `m`.`key`.`c2`")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } + + test("merge into schema evolution replace column for struct in map and set explicit columns") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + val schema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))), + StructType(Seq(StructField("c4", StringType), StructField("c5", StringType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(schema)) + + val data = Seq( + Row(0, Map(Row(10, 10) -> Row("c", "c")), "hr"), + Row(1, Map(Row(20, 20) -> Row("d", "d")), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))), + StructType(Seq(StructField("c4", StringType), StructField("c6", BooleanType))))), + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Map(Row(10, true) -> Row("y", false)), "sales"), + Row(2, Map(Row(20, false) -> Row("z", true)), "engineering") + ) + 
spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET t.m = src.m, t.dep = 'my_old_dep' + |WHEN NOT MATCHED THEN + | INSERT (pk, m, dep) VALUES (src.pk, src.m, 'my_new_dep') + |""".stripMargin + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(0, Map(Row(10, 10, null) -> Row("c", "c", null)), "hr"), + Row(1, Map(Row(10, null, true) -> Row("y", null, false)), "my_old_dep"), + Row(2, Map(Row(20, null, false) -> Row("z", null, true)), "my_new_dep"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `m`.`key`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `m`.`key`.`c2`")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } + + test("merge into schema evolution replace column for struct in array and set all columns") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + val schema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("a", ArrayType( + StructType(Seq(StructField("c1", 
IntegerType), StructField("c2", IntegerType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(schema)) + + val data = Seq( + Row(0, Array(Row(10, 10)), "hr"), + Row(1, Array(Row(20, 20)), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("a", ArrayType( + StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))))), + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Array(Row(10, true)), "sales"), + Row(2, Array(Row(20, false)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(0, Array(Row(10, 10, null)), "hr"), + Row(1, Array(Row(10, null, true)), "sales"), + Row(2, Array(Row(20, null, false)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `a`.`element`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `a`.`element`.`c2`")) + } 
+ } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } + + test("merge into schema evolution replace column for struct in array and set explicit columns") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + val schema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("a", ArrayType( + StructType(Seq(StructField("c1", IntegerType), StructField("c2", IntegerType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(schema)) + + val data = Seq( + Row(0, Array(Row(10, 10)), "hr"), + Row(1, Array(Row(20, 20)), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("a", ArrayType( + StructType(Seq(StructField("c1", IntegerType), StructField("c3", BooleanType))))), + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Array(Row(10, true)), "sales"), + Row(2, Array(Row(20, false)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET t.a = src.a, t.dep = 'my_old_dep' + |WHEN NOT MATCHED THEN + | INSERT (pk, a, dep) VALUES (src.pk, src.a, 'my_new_dep') + |""".stripMargin + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(0, Array(Row(10, 10, null)), "hr"), + Row(1, Array(Row(10, null, true)), "my_old_dep"), + Row(2, 
Array(Row(20, null, false)), "my_new_dep"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `a`.`element`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `a`.`element`.`c2`")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } test("merge into empty table with NOT MATCHED clause schema evolution") { Seq(true, false) foreach { withSchemaEvolution => withTempView("source") { @@ -4039,24 +4559,289 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } - test("merge with null struct") { - withTempView("source") { - createAndInitTable( - s"""pk INT NOT NULL, - |s STRUCT, - |dep STRING""".stripMargin, - """{ "pk": 0, "s": { "c1": 1, "c2": "a" }, "dep": "sales" } - |{ "pk": 1, "s": { "c1": 2, "c2": "b" }, "dep": "hr" }""" - .stripMargin) + test("merge into with source missing fields in struct nested in array") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has struct with 3 fields (c1, c2, c3) in array + createAndInitTable( + s"""pk INT NOT NULL, + |a ARRAY>, + |dep STRING""".stripMargin, + """{ "pk": 0, "a": [ { "c1": 1, "c2": "a", "c3": true } ], "dep": "sales" } + |{ "pk": 1, "a": [ { "c1": 2, "c2": "b", "c3": false } ], "dep": "sales" }""" + .stripMargin) - // Source table matches target table schema - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType), - StructField("s", StructType(Seq( - 
StructField("c1", IntegerType), - StructField("c2", StringType) - ))), - StructField("dep", StringType) + // Source table has struct with only 2 fields (c1, c2) - missing c3 + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("a", ArrayType( + StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType))))), // missing c3 field + StructField("dep", StringType))) + val data = Seq( + Row(1, Array(Row(10, "c")), "hr"), + Row(2, Array(Row(30, "e")), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + // Missing field c3 should be filled with NULL + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Array(Row(1, "a", true)), "sales"), + Row(1, Array(Row(10, "c", null)), "hr"), + Row(2, Array(Row(30, "e", null)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `a`.`element`.`c3`.")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + + test("merge into with source missing fields in struct nested in map key") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has struct with 2 fields in map key + val targetSchema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + 
StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType), StructField("c2", BooleanType))), + StructType(Seq(StructField("c3", StringType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(targetSchema)) + + val targetData = Seq( + Row(0, Map(Row(10, true) -> Row("x")), "hr"), + Row(1, Map(Row(20, false) -> Row("y")), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + // Source table has struct with only 1 field (c1) in map key - missing c2 + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType))), // missing c2 + StructType(Seq(StructField("c3", StringType))))), + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Map(Row(10) -> Row("z")), "sales"), + Row(2, Map(Row(20) -> Row("w")), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + // Missing field c2 should be filled with NULL + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Map(Row(10, true) -> Row("x")), "hr"), + Row(1, Map(Row(10, null) -> Row("z")), "sales"), + Row(2, Map(Row(20, null) -> Row("w")), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `m`.`key`.`c2`.")) + } + } + sql(s"DROP TABLE IF EXISTS 
$tableNameAsString") + } + } + } + + test("merge into with source missing fields in struct nested in map value") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has struct with 2 fields in map value + val targetSchema = + StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType))), + StructType(Seq(StructField("c1", StringType), StructField("c2", BooleanType))))), + StructField("dep", StringType))) + createTable(CatalogV2Util.structTypeToV2Columns(targetSchema)) + + val targetData = Seq( + Row(0, Map(Row(10) -> Row("x", true)), "hr"), + Row(1, Map(Row(20) -> Row("y", false)), "sales")) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + // Source table has struct with only 1 field (c1) in map value - missing c2 + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("m", MapType( + StructType(Seq(StructField("c1", IntegerType))), + StructType(Seq(StructField("c1", StringType))))), // missing c2 + StructField("dep", StringType))) + val sourceData = Seq( + Row(1, Map(Row(10) -> Row("z")), "sales"), + Row(2, Map(Row(20) -> Row("w")), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + // Missing field c2 should be filled with NULL + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Map(Row(10) -> Row("x", true)), "hr"), + Row(1, Map(Row(10) -> Row("z", null)), "sales"), + Row(2, 
Map(Row(20) -> Row("w", null)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `m`.`value`.`c2`.")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + + test("merge into with source missing fields in top-level struct") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has struct with 3 fields at top level + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": "a", "c3": true }, "dep": "sales"}""") + + // Source table has struct with only 2 fields (c1, c2) - missing c3 + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType)))), // missing c3 field + StructField("dep", StringType))) + val data = Seq( + Row(1, Row(10, "b"), "hr"), + Row(2, Row(20, "c"), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + // Missing field c3 should be filled with NULL + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, "a", true), "sales"), + Row(1, Row(10, "b", null), "hr"), + Row(2, Row(20, "c", null), "engineering"))) + } else { + val exception = 
intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c3`.")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + + test("merge with null struct") { + withTempView("source") { + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT<c1: INT, c2: STRING>, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": "a" }, "dep": "sales" } + |{ "pk": 1, "s": { "c1": 2, "c2": "b" }, "dep": "hr" }""" + .stripMargin) + + // Source table matches target table schema + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType) + ))), + StructField("dep", StringType) )) val data = Seq( @@ -4174,6 +4959,301 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase sql(s"DROP TABLE IF EXISTS $tableNameAsString") } + test("merge with null struct with missing nested field") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has nested struct with fields c1 and c2 + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT<c1: INT, c2: STRUCT<a: INT, b: STRING>>, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } + |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" + .stripMargin) + + // Source table has null for the nested struct + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType) + // missing field 'b' + ))) + ))), + StructField("dep", 
StringType) + )) + + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t USING source + |ON t.pk = source.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, null, "engineering"), + Row(2, null, "finance"))) + } else { + // Without coercion, the merge should fail due to missing field + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c2`.`b`.")) + } + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + + test("merge null struct with schema evolution - source with missing and extra nested fields") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + // Target table has nested struct with fields c1 and c2 + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT>, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } + |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" + .stripMargin) + + // Source table has missing field 'b' and extra field 'c' in nested struct + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", 
StructType(Seq( + StructField("a", IntegerType), + // missing field 'b' + StructField("c", StringType) // extra field 'c' + ))) + ))), + StructField("dep", StringType) + )) + + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t USING source + |ON t.pk = source.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + // extra nested field is added + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, Row(10, "x", null)), "sales"), + Row(1, null, "engineering"), + Row(2, null, "finance"))) + } else { + // extra nested field is not added + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot write extra fields `c` to the struct `s`.`c2`")) + } + } else { + // Without source struct coercion, the merge should fail + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c2`.`b`.")) + } + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + + test("merge null struct with non-nullable nested field - source with missing " + + "and extra nested fields") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, 
false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT>, + |dep STRING""".stripMargin, + """{ "pk": 0, "s": { "c1": 1, "c2": { "a": 10, "b": "x" } }, "dep": "sales" } + |{ "pk": 1, "s": { "c1": 2, "c2": { "a": 20, "b": "y" } }, "dep": "hr" }""" + .stripMargin) + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType), + StructField("c", StringType) + ))) + ))), + StructField("dep", StringType) + )) + + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val schemaEvolutionClause = if (withSchemaEvolution) "WITH SCHEMA EVOLUTION" else "" + val mergeStmt = + s"""MERGE $schemaEvolutionClause + |INTO $tableNameAsString t USING source + |ON t.pk = source.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`b`")) + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + } + + test("merge with null struct using default value") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + withTempView("source") { + sql( + s"""CREATE TABLE $tableNameAsString ( + | pk INT NOT NULL, + | s STRUCT> DEFAULT + | named_struct('c1', 999, 'c2', named_struct('a', 999, 'b', 'default')), + | dep STRING) + 
|PARTITIONED BY (dep) + |""".stripMargin) + + val initialSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType), + StructField("b", StringType) + ))) + ))), + StructField("dep", StringType) + )) + val initialData = Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, Row(2, Row(20, "y")), "hr") + ) + spark.createDataFrame(spark.sparkContext.parallelize(initialData), initialSchema) + .writeTo(tableNameAsString).append() + + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType) + ))) + ))), + StructField("dep", StringType) + )) + + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + val mergeStmt = + s"""MERGE INTO $tableNameAsString t USING source + |ON t.pk = source.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (coerceNestedTypes) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, null, "engineering"), + Row(2, null, "finance"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + sql(mergeStmt) + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`b`")) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + + test("merge with source missing struct column with default value") { withTempView("source") { // Target table has nested struct with a default value @@ -4238,6 +5318,68 @@ abstract class 
MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } + test("merge into with source missing fields in nested struct") { + Seq(true, false).foreach { nestedTypeCoercion => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key + -> nestedTypeCoercion.toString) { + withTempView("source") { + // Target table has nested struct: s.c1, s.c2.a, s.c2.b + createAndInitTable( + s"""pk INT NOT NULL, + |s STRUCT>, + |dep STRING""".stripMargin, + """{ "pk": 1, "s": { "c1": 2, "c2": { "a": 10, "b": true } } } + |{ "pk": 2, "s": { "c1": 2, "c2": { "a": 30, "b": false } } }""".stripMargin) + + // Source table is missing field 'b' in nested struct s.c2 + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType) + // missing field 'b' + ))) + ))), + StructField("dep", StringType) + )) + val data = Seq( + Row(1, Row(10, Row(20)), "sales"), + Row(2, Row(20, Row(30)), "engineering") + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source") + + // Missing field b should be filled with NULL + val mergeStmt = s"""MERGE INTO $tableNameAsString t + |USING source src + |ON t.pk = src.pk + |WHEN MATCHED THEN + | UPDATE SET * + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin + + if (nestedTypeCoercion) { + sql(mergeStmt) + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, Row(10, Row(20, true)), "sales"), + Row(2, Row(20, Row(30, false)), "engineering"))) + } else { + val exception = intercept[Exception] { + sql(mergeStmt) + } + assert(exception.getMessage.contains( + """Cannot write incompatible data for the table ``""".stripMargin)) + } + } + sql(s"DROP TABLE IF EXISTS $tableNameAsString") + } + } + } + test("merge with named_struct missing non-nullable field") { withTempView("source") { createAndInitTable( @@ 
-4627,105 +5769,332 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase test("merge schema evolution replace column with nested struct and " + "set explicit columns using dataframe API") { Seq(true, false).foreach { withSchemaEvolution => - val sourceTable = "cat.ns1.source_table" - withTable(sourceTable) { - sql( - s"""CREATE TABLE $tableNameAsString ( - |pk INT NOT NULL, - |s STRUCT, m: MAP>>, - |dep STRING)""".stripMargin) + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, m: MAP>>, + |dep STRING)""".stripMargin) + + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + // removed column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) + + val data = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + 
Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + + val mergeBuilder = spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .update(Map( + "s.c1" -> lit(-1), + "s.c2.m" -> map(lit("k"), lit("v")), + "s.c2.a" -> array(lit(-1)), + "s.c2.c3" -> col("source_table.s.c2.c3"))) + .whenNotMatched() + .insert(Map( + "pk" -> col("source_table.pk"), + "s" -> struct( + col("source_table.s.c1").as("c1"), + struct( + array(lit(-2)).as("a"), + map(lit("g"), lit("h")).as("m"), + lit(true).as("c3") + ).as("c2") + ), + "dep" -> col("source_table.dep"))) + + if (coerceNestedTypes && withSchemaEvolution) { + mergeBuilder.withSchemaEvolution().merge() + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), + Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == "FIELD_NOT_FOUND") + assert(exception.getMessage.contains("No such struct field `c3` in `a`, `m`. 
")) + } + } + sql(s"DROP TABLE $tableNameAsString") + } + } + } + } - val targetData = Seq( - Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") - ) - val targetSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("a", ArrayType(IntegerType)), - StructField("m", MapType(StringType, StringType)) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) - .writeTo(tableNameAsString).append() + test("merge schema evolution replace column with nested struct and set all columns " + + "using dataframe API") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, m: MAP>>, + |dep STRING) + |PARTITIONED BY (dep) + |""".stripMargin) - val sourceIdent = Identifier.of(Array("ns1"), "source_table") - val columns = Array( - Column.create("pk", IntegerType, false), - Column.create("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - // removed column 'a' - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) // new column - ))) - ))), - Column.create("dep", StringType)) - val tableInfo = new TableInfo.Builder() - .withColumns(columns) - .withProperties(extraTableProps) - .build() - catalog.createTable(sourceIdent, tableInfo) + val tableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, 
StringType)) + ))) + ))), + StructField("dep", StringType) + )) + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) + .coalesce(1).writeTo(tableNameAsString).append() + + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + // missing column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) + + val sourceData = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + + val mergeBuilder = spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .updateAll() + .whenNotMatched() + .insertAll() + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + mergeBuilder.withSchemaEvolution().merge() + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(null, 
Map("e" -> "f"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `s`.`c2`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`a`")) + } - val data = Seq( - Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), - Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") - ) - val sourceTableSchema = StructType(Seq( - StructField("pk", IntegerType, nullable = false), - StructField("s", StructType(Seq( - StructField("c1", IntegerType), - StructField("c2", StructType(Seq( - StructField("m", MapType(StringType, StringType)), - StructField("c3", BooleanType) - ))) - ))), - StructField("dep", StringType) - )) - spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) - .createOrReplaceTempView("source_temp") + sql(s"DROP TABLE $tableNameAsString") + } + } + } + } + } - sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + test("merge schema evolution replace column with nested struct and " + + "update top level struct using dataframe API") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, m: MAP>>, + |dep STRING) + |PARTITIONED BY (dep) + |""".stripMargin) - val mergeBuilder = spark.table(sourceTable) - .mergeInto(tableNameAsString, 
$"source_table.pk" === col(tableNameAsString + ".pk")) - .whenMatched() - .update(Map( - "s.c1" -> lit(-1), - "s.c2.m" -> map(lit("k"), lit("v")), - "s.c2.a" -> array(lit(-1)), - "s.c2.c3" -> col("source_table.s.c2.c3"))) - .whenNotMatched() - .insert(Map( - "pk" -> col("source_table.pk"), - "s" -> struct( - col("source_table.s.c1").as("c1"), - struct( - array(lit(-2)).as("a"), - map(lit("g"), lit("h")).as("m"), - lit(true).as("c3") - ).as("c2") - ), - "dep" -> col("source_table.dep"))) + val tableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", ArrayType(IntegerType)), + StructField("m", MapType(StringType, StringType)) + ))) + ))), + StructField("dep", StringType) + )) + val targetData = Seq( + Row(1, Row(2, Row(Array(1, 2), Map("a" -> "b"))), "hr") + ) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), tableSchema) + .coalesce(1).writeTo(tableNameAsString).append() + + // Create source table + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + // missing column 'a' + StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) // new column + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) + + val sourceData = Seq( + Row(1, Row(10, Row(Map("c" -> "d"), false)), "sales"), + Row(2, Row(20, Row(Map("e" -> "f"), true)), "engineering") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + 
StructField("m", MapType(StringType, StringType)), + StructField("c3", BooleanType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(sourceData), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + + val mergeBuilder = spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .update(Map("s" -> col("source_table.s"))) + .whenNotMatched() + .insertAll() + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + mergeBuilder.withSchemaEvolution().merge() + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "hr"), + Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write extra fields `c3` to the struct `s`.`c2`")) + } + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot find data for the output column `s`.`c2`.`a`")) + } - if (withSchemaEvolution) { - mergeBuilder.withSchemaEvolution().merge() - checkAnswer( - sql(s"SELECT * FROM $tableNameAsString"), - Seq(Row(1, Row(-1, Row(Seq(-1), Map("k" -> "v"), false)), "hr"), - Row(2, Row(20, Row(Seq(-2), Map("g" -> "h"), true)), "engineering"))) - } else { - val exception = intercept[org.apache.spark.sql.AnalysisException] { - mergeBuilder.merge() + sql(s"DROP TABLE $tableNameAsString") } - assert(exception.errorClass.get == "FIELD_NOT_FOUND") - assert(exception.getMessage.contains("No such struct field `c3` in 
`a`, `m`. ")) } - - sql(s"DROP TABLE $tableNameAsString") } } } @@ -4790,6 +6159,324 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase } } + test("merge into with source missing fields in top-level struct using dataframe API") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + // Target table has struct with 3 fields at top level + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT, + |dep STRING)""".stripMargin) + + val targetData = Seq( + Row(0, Row(1, "a", true), "sales") + ) + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType), + StructField("c3", BooleanType) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + // Create source table with struct having only 2 fields (c1, c2) - missing c3 + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType)))), // missing c3 field + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) + + val data = Seq( + Row(1, Row(10, "b"), "hr"), + Row(2, Row(20, "c"), "engineering") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType)))), + StructField("dep", StringType))) + 
spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + + if (coerceNestedTypes) { + spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .updateAll() + .whenNotMatched() + .insertAll() + .merge() + + // Missing field c3 should be filled with NULL + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, "a", true), "sales"), + Row(1, Row(10, "b", null), "hr"), + Row(2, Row(20, "c", null), "engineering"))) + } else { + val exception = intercept[org.apache.spark.sql.AnalysisException] { + spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .updateAll() + .whenNotMatched() + .insertAll() + .merge() + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c3`.")) + } + + sql(s"DROP TABLE $tableNameAsString") + } + } + } + } + + test("merge with null struct with missing nested field using dataframe API") { + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + // Target table has nested struct with fields c1 and c2 + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT>, + |dep STRING)""".stripMargin) + + val targetData = Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, Row(2, Row(20, "y")), "hr") + ) + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + 
StructField("a", IntegerType), + StructField("b", StringType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + // Create source table with missing nested field 'b' + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType) + // missing field 'b' + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + catalog.createTable(sourceIdent, tableInfo) + + // Source table has null for the nested struct + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + val mergeBuilder = spark.table(sourceTable) + .mergeInto(tableNameAsString, + $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .updateAll() + .whenNotMatched() + .insertAll() + + if (coerceNestedTypes) { + mergeBuilder.merge() + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, null, "engineering"), + Row(2, null, "finance"))) + } else { + // Without coercion, the merge should fail due to missing field + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get 
== + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c2`.`b`.")) + } + + sql(s"DROP TABLE $tableNameAsString") + } + } + } + } + + test("merge null struct with schema evolution - " + + "source with missing and extra nested fields using dataframe API") { + Seq(true, false).foreach { withSchemaEvolution => + Seq(true, false).foreach { coerceNestedTypes => + withSQLConf(SQLConf.MERGE_INTO_NESTED_TYPE_COERCION_ENABLED.key -> + coerceNestedTypes.toString) { + val sourceTable = "cat.ns1.source_table" + withTable(sourceTable) { + // Target table has nested struct with fields c1 and c2 + sql( + s"""CREATE TABLE $tableNameAsString ( + |pk INT NOT NULL, + |s STRUCT>, + |dep STRING)""".stripMargin) + + val targetData = Seq( + Row(0, Row(1, Row(10, "x")), "sales"), + Row(1, Row(2, Row(20, "y")), "hr") + ) + val targetSchema = StructType(Seq( + StructField("pk", IntegerType, nullable = false), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType), + StructField("b", StringType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(targetData), targetSchema) + .writeTo(tableNameAsString).append() + + // Create source table with missing field 'b' and extra field 'c' in nested struct + val sourceIdent = Identifier.of(Array("ns1"), "source_table") + val columns = Array( + Column.create("pk", IntegerType, false), + Column.create("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType), + // missing field 'b' + StructField("c", StringType) // extra field 'c' + ))) + ))), + Column.create("dep", StringType)) + val tableInfo = new TableInfo.Builder() + .withColumns(columns) + .withProperties(extraTableProps) + .build() + 
catalog.createTable(sourceIdent, tableInfo) + + // Source data has null for the nested struct + val data = Seq( + Row(1, null, "engineering"), + Row(2, null, "finance") + ) + val sourceTableSchema = StructType(Seq( + StructField("pk", IntegerType), + StructField("s", StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StructType(Seq( + StructField("a", IntegerType), + StructField("c", StringType) + ))) + ))), + StructField("dep", StringType) + )) + spark.createDataFrame(spark.sparkContext.parallelize(data), sourceTableSchema) + .createOrReplaceTempView("source_temp") + + sql(s"INSERT INTO $sourceTable SELECT * FROM source_temp") + + val mergeBuilder = spark.table(sourceTable) + .mergeInto(tableNameAsString, $"source_table.pk" === col(tableNameAsString + ".pk")) + .whenMatched() + .updateAll() + .whenNotMatched() + .insertAll() + + if (coerceNestedTypes) { + if (withSchemaEvolution) { + // extra nested field is added + mergeBuilder.withSchemaEvolution().merge() + checkAnswer( + sql(s"SELECT * FROM $tableNameAsString"), + Seq( + Row(0, Row(1, Row(10, "x", null)), "sales"), + Row(1, null, "engineering"), + Row(2, null, "finance"))) + } else { + // extra nested field is not added + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.EXTRA_STRUCT_FIELDS") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot write extra fields `c` to the struct `s`.`c2`")) + } + } else { + // Without source struct coercion, the merge should fail + val exception = intercept[org.apache.spark.sql.AnalysisException] { + mergeBuilder.merge() + } + assert(exception.errorClass.get == + "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA") + assert(exception.getMessage.contains( + "Cannot write incompatible data for the table ``: " + + "Cannot find data for the output column `s`.`c2`.`b`.")) + } + + sql(s"DROP TABLE 
$tableNameAsString") + } + } + } + } + } + test("Merge schema evolution should error on non-existent column in UPDATE and INSERT") { withTable(tableNameAsString) { withTempView("source") { From fe4e6cd2972eef9bbc25d038af947ecbc207573d Mon Sep 17 00:00:00 2001 From: Szehon Ho Date: Wed, 26 Nov 2025 12:14:51 -0800 Subject: [PATCH 3/3] Revert https://github.com/apache/spark/pull/53149 --- .../catalyst/analysis/AssignmentUtils.scala | 139 +----------------- .../connector/MergeIntoTableSuiteBase.scala | 8 +- 2 files changed, 10 insertions(+), 137 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentUtils.scala index 185dc5ec54f6..c9ed5b86dbde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentUtils.scala @@ -21,8 +21,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.TableOutputResolver.DefaultValueFillMode.{NONE, RECURSE} -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, CreateNamedStruct, Expression, GetStructField, If, IsNull, Literal} -import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull +import org.apache.spark.sql.catalyst.expressions.{Attribute, CreateNamedStruct, Expression, GetStructField, IsNull, Literal} import org.apache.spark.sql.catalyst.plans.logical.Assignment import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.CharVarcharUtils @@ -182,31 +181,11 @@ object AssignmentUtils extends SQLConfHelper with CastSupport { } else if (exactAssignments.isEmpty && fieldAssignments.isEmpty) { TableOutputResolver.checkNullability(colExpr, col, conf, colPath) } else if (exactAssignments.nonEmpty) { - if (updateStar) { - val value = 
exactAssignments.head.value - col.dataType match { - case structType: StructType => - // Expand assignments to leaf fields - val structAssignment = - applyNestedFieldAssignments(col, colExpr, value, addError, colPath, - coerceNestedTypes) - - // Wrap with null check for missing source fields - fixNullExpansion(col, value, structType, structAssignment, - colPath, addError) - case _ => - // For non-struct types, resolve directly - val coerceMode = if (coerceNestedTypes) RECURSE else NONE - TableOutputResolver.resolveUpdate("", value, col, conf, addError, colPath, - coerceMode) - } - } else { - val value = exactAssignments.head.value - val coerceMode = if (coerceNestedTypes) RECURSE else NONE - val resolvedValue = TableOutputResolver.resolveUpdate("", value, col, conf, addError, - colPath, coerceMode) - resolvedValue - } + val value = exactAssignments.head.value + val coerceMode = if (coerceNestedTypes) RECURSE else NONE + val resolvedValue = TableOutputResolver.resolveUpdate("", value, col, conf, addError, + colPath, coerceMode) + resolvedValue } else { applyFieldAssignments(col, colExpr, fieldAssignments, addError, colPath, coerceNestedTypes) } @@ -240,63 +219,6 @@ object AssignmentUtils extends SQLConfHelper with CastSupport { } } - private def applyNestedFieldAssignments( - col: Attribute, - colExpr: Expression, - value: Expression, - addError: String => Unit, - colPath: Seq[String], - coerceNestedTyptes: Boolean): Expression = { - - col.dataType match { - case structType: StructType => - val fieldAttrs = DataTypeUtils.toAttributes(structType) - - val updatedFieldExprs = fieldAttrs.zipWithIndex.map { case (fieldAttr, ordinal) => - val fieldPath = colPath :+ fieldAttr.name - val targetFieldExpr = GetStructField(colExpr, ordinal, Some(fieldAttr.name)) - - // Try to find a corresponding field in the source value by name - val sourceFieldValue: Expression = value.dataType match { - case valueStructType: StructType => - valueStructType.fields.find(f => 
conf.resolver(f.name, fieldAttr.name)) match { - case Some(matchingField) => - // Found matching field in source, extract it - val fieldIndex = valueStructType.fieldIndex(matchingField.name) - GetStructField(value, fieldIndex, Some(matchingField.name)) - case None => - // Field doesn't exist in source, use target's current value with null check - TableOutputResolver.checkNullability(targetFieldExpr, fieldAttr, conf, fieldPath) - } - case _ => - // Value is not a struct, cannot extract field - addError(s"Cannot assign non-struct value to struct field '${fieldPath.quoted}'") - Literal(null, fieldAttr.dataType) - } - - // Recurse or resolve based on field type - fieldAttr.dataType match { - case nestedStructType: StructType => - // Field is a struct, recurse - applyNestedFieldAssignments(fieldAttr, targetFieldExpr, sourceFieldValue, - addError, fieldPath, coerceNestedTyptes) - case _ => - // Field is not a struct, resolve with TableOutputResolver - val coerceMode = if (coerceNestedTyptes) RECURSE else NONE - TableOutputResolver.resolveUpdate("", sourceFieldValue, fieldAttr, conf, addError, - fieldPath, coerceMode) - } - } - toNamedStruct(structType, updatedFieldExprs) - - case otherType => - addError( - "Updating nested fields is only supported for StructType but " + - s"'${colPath.quoted}' is of type $otherType") - colExpr - } - } - private def toNamedStruct(structType: StructType, fieldExprs: Seq[Expression]): Expression = { val namedStructExprs = structType.fields.zip(fieldExprs).flatMap { case (field, expr) => Seq(Literal(field.name), expr) @@ -350,55 +272,6 @@ object AssignmentUtils extends SQLConfHelper with CastSupport { IsNull(currentExpr) } - /** - * As UPDATE SET * can assign struct fields individually (preserving existing fields), - * this will lead to null expansion, ie, a struct is created where all fields are null. - * Wraps a struct assignment with null checks for the source and missing source fields. - * Return null if all are null. 
- * - * @param col the target column attribute - * @param value the source value expression - * @param structType the target struct type - * @param structAssignment the struct assignment result to wrap - * @param colPath the column path for error reporting - * @param addError error reporting function - * @return the wrapped expression with null checks - */ - private def fixNullExpansion( - col: Attribute, - value: Expression, - structType: StructType, - structAssignment: Expression, - colPath: Seq[String], - addError: String => Unit): Expression = { - // As StoreAssignmentPolicy.LEGACY is not allowed in DSv2, always add null check for - // non-nullable column - if (!col.nullable) { - AssertNotNull(value) - } else { - // Check if source struct is null - val valueIsNull = IsNull(value) - - // Check if missing source paths (paths in target but not in source) are not null - // These will be null for the case of UPDATE SET * and - val missingSourcePaths = getMissingSourcePaths(structType, value.dataType, colPath, addError) - val condition = if (missingSourcePaths.nonEmpty) { - // Check if all target attributes at missing source paths are null - val missingFieldNullChecks = missingSourcePaths.map { path => - createNullCheckForFieldPath(col, path) - } - // Combine all null checks with AND - val allMissingFieldsNull = missingFieldNullChecks.reduce[Expression]((a, b) => And(a, b)) - And(valueIsNull, allMissingFieldsNull) - } else { - valueIsNull - } - - // Return: If (condition) THEN NULL ELSE structAssignment - If(condition, Literal(null, structAssignment.dataType), structAssignment) - } - } - /** * Checks whether assignments are aligned and compatible with table columns. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index e0432dc3227f..7539506e8bfe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -3321,7 +3321,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), Seq( - Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), + Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "sales"), Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) } else { val exception = intercept[org.apache.spark.sql.AnalysisException] { @@ -5365,8 +5365,8 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), Seq( - Row(1, Row(10, Row(20, true)), "sales"), - Row(2, Row(20, Row(30, false)), "engineering"))) + Row(1, Row(10, Row(20, null)), "sales"), + Row(2, Row(20, Row(30, null)), "engineering"))) } else { val exception = intercept[Exception] { sql(mergeStmt) @@ -5960,7 +5960,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase checkAnswer( sql(s"SELECT * FROM $tableNameAsString"), Seq( - Row(1, Row(10, Row(Seq(1, 2), Map("c" -> "d"), false)), "sales"), + Row(1, Row(10, Row(null, Map("c" -> "d"), false)), "sales"), Row(2, Row(20, Row(null, Map("e" -> "f"), true)), "engineering"))) } else { val exception = intercept[org.apache.spark.sql.AnalysisException] {