diff --git a/docs/docs/spark/sql-write.md b/docs/docs/spark/sql-write.md index 3a05500cccf8..518bf0e5b40c 100644 --- a/docs/docs/spark/sql-write.md +++ b/docs/docs/spark/sql-write.md @@ -221,6 +221,23 @@ WHEN NOT MATCHED THEN INSERT * -- when not matched, insert this row without any transformation; ``` +### Column Alignment + +Assignments are aligned to the target table by **column name**. + +For explicit clauses (`UPDATE SET col = expr` / `INSERT (col list) VALUES ...`), only the mentioned columns are written. Unmentioned target columns preserve their current value for `UPDATE`, or are filled with NULL / `CURRENT_DEFAULT` for `INSERT`. + +For star clauses (`UPDATE SET *` / `INSERT *`), `*` expands against the **target** columns. The behavior when source and target columns don't match exactly depends on `spark.paimon.write.merge-schema` (see [Write Merge Schema](#write-merge-schema)): + +| Scenario | `merge-schema=false` (default) | `merge-schema=true` | +|----------|-------------------------------|---------------------| +| Top-level source-extra columns | Silently dropped (`*` only covers target columns) | Evolved into the target schema | +| Top-level target columns missing from source | Throws | `UPDATE *` preserves current value; `INSERT *` fills NULL | +| Nested struct source-extra fields | Throws | Evolved into the target schema | +| Nested struct target-missing fields | Throws | `UPDATE *` preserves current value; `INSERT *` fills NULL | + +The key difference between top-level and nested: under strict mode (`merge-schema=false`), top-level source-extras are silently dropped because `*` never references them, while nested source-extras inside a struct value throw an error to avoid silent data loss. 
+ ## Write Merge Schema :::info diff --git a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala deleted file mode 100644 index 95ae7471daf8..000000000000 --- a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.{MergeAction, MergeIntoTable} - -/** A post-hoc resolution rule for MergeInto. */ -case class PaimonMergeInto(spark: SparkSession) extends PaimonMergeIntoBase { - - /** - * Align all MergeActions in a MergeIntoTable based on the target table's output attributes. - * Returns a new MergeIntoTable with aligned matchedActions and notMatchedActions. 
- */ - override def alignMergeIntoTable( - m: MergeIntoTable, - targetOutput: Seq[Attribute]): MergeIntoTable = { - m.copy( - matchedActions = m.matchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedActions = m.notMatchedActions.map(alignMergeAction(_, targetOutput)) - ) - } - - override def resolveNotMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = { - Seq.empty - } -} diff --git a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala deleted file mode 100644 index e0869a6089e9..000000000000 --- a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MergeAction, MergeIntoTable} - -object PaimonMergeIntoResolver extends PaimonMergeIntoResolverBase { - - def resolveNotMatchedBySourceActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - Seq.empty - } - - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable = { - if (resolvedNotMatchedBySource.nonEmpty) { - throw new RuntimeException("WHEN NOT MATCHED BY SOURCE is not supported here.") - } - - merge.copy( - mergeCondition = resolvedCond, - matchedActions = resolvedMatched, - notMatchedActions = resolvedNotMatched) - } - -} diff --git a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala index e7947acb83ab..dffd53beb9c4 100644 --- a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala +++ b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala @@ -38,4 +38,7 @@ object MinorVersionShim { notMatchedBySourceActions: Seq[MergeAction]): MergeIntoTable = { MergeIntoTable(targetTable, sourceTable, mergeCondition, matchedActions, notMatchedActions) } + + // Spark 3.2 has no `notMatchedBySourceActions` field on `MergeIntoTable` (added in 3.4). 
+ def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = Seq.empty } diff --git a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala b/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala deleted file mode 100644 index a92d13cc0b35..000000000000 --- a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.{MergeAction, MergeIntoTable} - -/** A post-hoc resolution rule for MergeInto. */ -case class PaimonMergeInto(spark: SparkSession) extends PaimonMergeIntoBase { - - /** - * Align all MergeActions in a MergeIntoTable based on the target table's output attributes. - * Returns a new MergeIntoTable with aligned matchedActions and notMatchedActions. 
- */ - override def alignMergeIntoTable( - m: MergeIntoTable, - targetOutput: Seq[Attribute]): MergeIntoTable = { - m.copy( - matchedActions = m.matchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedActions = m.notMatchedActions.map(alignMergeAction(_, targetOutput)) - ) - } - - override def resolveNotMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = { - Seq.empty - } - -} diff --git a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala b/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala deleted file mode 100644 index e0869a6089e9..000000000000 --- a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MergeAction, MergeIntoTable} - -object PaimonMergeIntoResolver extends PaimonMergeIntoResolverBase { - - def resolveNotMatchedBySourceActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - Seq.empty - } - - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable = { - if (resolvedNotMatchedBySource.nonEmpty) { - throw new RuntimeException("WHEN NOT MATCHED BY SOURCE is not supported here.") - } - - merge.copy( - mergeCondition = resolvedCond, - matchedActions = resolvedMatched, - notMatchedActions = resolvedNotMatched) - } - -} diff --git a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala b/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala index e7947acb83ab..ae0d96e8f8e1 100644 --- a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala +++ b/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala @@ -38,4 +38,7 @@ object MinorVersionShim { notMatchedBySourceActions: Seq[MergeAction]): MergeIntoTable = { MergeIntoTable(targetTable, sourceTable, mergeCondition, matchedActions, notMatchedActions) } + + // Spark 3.3 has no `notMatchedBySourceActions` field on `MergeIntoTable` (added in 3.4). 
+ def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = Seq.empty } diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala deleted file mode 100644 index b43b7a59a6c5..000000000000 --- a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.{DeleteAction, LogicalPlan, MergeAction, MergeIntoTable, UpdateAction} - -/** Resolve all the expressions for MergeInto. 
*/ -object PaimonMergeIntoResolver extends PaimonMergeIntoResolverBase { - - def resolveNotMatchedBySourceActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.notMatchedBySourceActions.map { - case DeleteAction(condition) => - // The condition must be from the target table - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, TARGET_ONLY)) - DeleteAction(resolvedCond) - case PaimonUpdateAction(condition, assignments) => - // The condition and value must be from the target table - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, TARGET_ONLY)) - val resolvedAssignments = resolveAssignments(resolve, assignments, merge, TARGET_ONLY) - UpdateAction(resolvedCond, resolvedAssignments) - case action => - throw new RuntimeException(s"Can't recognize this action: $action") - } - } - - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable = { - merge.copy( - mergeCondition = resolvedCond, - matchedActions = resolvedMatched, - notMatchedActions = resolvedNotMatched, - notMatchedBySourceActions = resolvedNotMatchedBySource - ) - } - -} diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala deleted file mode 100644 index 4222c54d6e9d..000000000000 --- a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical._ - -trait PaimonMergeIntoResolverBase extends ExpressionHelper { - - def apply(merge: MergeIntoTable, spark: SparkSession): LogicalPlan = { - val target = merge.targetTable - val source = merge.sourceTable - assert(target.resolved, "Target should have been resolved here.") - assert(source.resolved, "Source should have been resolved here.") - - val resolve: (Expression, LogicalPlan) => Expression = resolveExpression(spark) - - val resolvedCond = resolveCondition(resolve, merge.mergeCondition, merge, ALL) - val resolvedMatched = resolveMatchedByTargetActions(merge, resolve) - val resolvedNotMatched = resolveNotMatchedByTargetActions(merge, resolve) - val resolvedNotMatchedBySource = resolveNotMatchedBySourceActions(merge, resolve) - - build(merge, resolvedCond, resolvedMatched, resolvedNotMatched, resolvedNotMatchedBySource) - } - - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable 
- - private def resolveMatchedByTargetActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.matchedActions.map { - case DeleteAction(condition) => - // The condition can be from both target and source tables - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - DeleteAction(resolvedCond) - case PaimonUpdateAction(condition, assignments) => - // The condition and value can be from both target and source tables - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - val resolvedAssignments = resolveAssignments(resolve, assignments, merge, ALL) - UpdateAction(resolvedCond, resolvedAssignments) - case UpdateStarAction(condition) => - // The condition can be from both target and source tables, but the value must be from the source table - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - val assignments = merge.targetTable.output.map { - attr => Assignment(attr, UnresolvedAttribute(Seq(attr.name))) - } - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - // Tag so merge-schema can distinguish `UPDATE *` from a fully-listed explicit clause. 
- PaimonMergeActionTags.markFromStar(UpdateAction(resolvedCond, resolvedAssignments)) - case action => - throw new RuntimeException(s"Can't recognize this action: $action") - } - } - - private def resolveNotMatchedByTargetActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.notMatchedActions.map { - case InsertAction(condition, assignments) => - // The condition and value must be from the source table - val resolvedCond = - condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - InsertAction(resolvedCond, resolvedAssignments) - case InsertStarAction(condition) => - // The condition and value must be from the source table - val resolvedCond = - condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) - val assignments = merge.targetTable.output.map { - attr => Assignment(attr, UnresolvedAttribute(Seq(attr.name))) - } - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - PaimonMergeActionTags.markFromStar(InsertAction(resolvedCond, resolvedAssignments)) - case action => - throw new RuntimeException(s"Can't recognize this action: $action") - } - } - - def resolveNotMatchedBySourceActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] - - sealed trait ResolvedWith - case object ALL extends ResolvedWith - case object SOURCE_ONLY extends ResolvedWith - case object TARGET_ONLY extends ResolvedWith - - def resolveCondition( - resolve: (Expression, LogicalPlan) => Expression, - condition: Expression, - mergeInto: MergeIntoTable, - resolvedWith: ResolvedWith): Expression = { - resolvedWith match { - case ALL => resolve(condition, mergeInto) - case SOURCE_ONLY => resolve(condition, Project(Nil, mergeInto.sourceTable)) - case TARGET_ONLY => resolve(condition, Project(Nil, mergeInto.targetTable)) - } - } - - def 
resolveAssignments( - resolve: (Expression, LogicalPlan) => Expression, - assignments: Seq[Assignment], - mergeInto: MergeIntoTable, - resolvedWith: ResolvedWith): Seq[Assignment] = { - assignments.map { - assign => - val resolvedKey = resolve(assign.key, Project(Nil, mergeInto.targetTable)) - val resolvedValue = resolvedWith match { - case ALL => resolve(assign.value, mergeInto) - case SOURCE_ONLY => resolve(assign.value, Project(Nil, mergeInto.sourceTable)) - case TARGET_ONLY => resolve(assign.value, Project(Nil, mergeInto.targetTable)) - } - Assignment(resolvedKey, resolvedValue) - } - } -} diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala index 731b655cd973..0f7ea24e66af 100644 --- a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala +++ b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala @@ -256,6 +256,19 @@ class Spark4Shim extends SparkShim { withSchemaEvolution) } + override def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = + merge.notMatchedBySourceActions + + override def createUpdateAction( + condition: Option[Expression], + assignments: Seq[Assignment]): UpdateAction = + UpdateAction(condition, assignments) + + override def createInsertAction( + condition: Option[Expression], + assignments: Seq[Assignment]): InsertAction = + InsertAction(condition, assignments) + override def copyDataSourceV2Relation( relation: DataSourceV2Relation, table: Table, @@ -263,18 +276,6 @@ class Spark4Shim extends SparkShim { relation.copy(table = table, output = output) } - override def copyUpdateAction( - action: UpdateAction, - assignments: Seq[Assignment]): UpdateAction = { - action.copy(assignments = assignments) - } - - override def copyInsertAction( - action: InsertAction, - assignments: 
Seq[Assignment]): InsertAction = { - action.copy(assignments = assignments) - } - // Spark 4.0 still has `SubstituteUnresolvedOrdinals` (Spark 4.1 removed it because the new // resolver framework handles ordinals inline). `PaimonViewResolver` applies the shim's early // rules to the parsed view text before storing, so we must substitute `ORDER BY 1` → diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/AssignmentAlignmentHelper.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/AssignmentAlignmentHelper.scala deleted file mode 100644 index f85baf846ac6..000000000000 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/AssignmentAlignmentHelper.scala +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.paimon.spark.SparkTypeUtils.CURRENT_DEFAULT_COLUMN_METADATA_KEY -import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper - -import org.apache.spark.sql.{PaimonUtils, SparkSession} -import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{Alias, ArrayTransform, Attribute, CreateNamedStruct, Expression, GetStructField, If, IsNull, LambdaFunction, Literal, MapFromArrays, MapKeys, MapValues, NamedExpression, NamedLambdaVariable} -import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, InsertAction, InsertStarAction, MergeAction, MergeIntoTable, UpdateAction, UpdateStarAction} -import org.apache.spark.sql.paimon.shims.SparkShimLoader -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} - -trait AssignmentAlignmentHelper extends SQLConfHelper with ExpressionHelper { - - private lazy val resolver = conf.resolver - - /** - * @param ref - * attribute reference seq, e.g. a => Seq["a"], s.c1 => Seq["s", "c1"] - * @param expr - * update expression - */ - private case class AttrUpdate(ref: Seq[String], expr: Expression) - - /** - * Generate aligned expressions, only supports PrimitiveType and StructType. For example, if attrs - * are [a int, b int, s struct(c1 int, c2 int)] and update assignments are [a = 1, s.c1 = 2], will - * return [1, b, struct(2, c2)]. 
- * @param attrs - * target attrs - * @param assignments - * update assignments - * @return - * aligned expressions - */ - protected def generateAlignedExpressions( - attrs: Seq[Attribute], - assignments: Seq[Assignment], - isInsert: Boolean = false): Seq[Expression] = { - val attrUpdates = assignments.map(a => AttrUpdate(toRefSeq(a.key), a.value)) - recursiveAlignUpdates(attrs, attrUpdates, Nil, isInsert) - } - - protected def alignAssignments( - attrs: Seq[Attribute], - assignments: Seq[Assignment], - isInsert: Boolean = false): Seq[Assignment] = { - generateAlignedExpressions(attrs, assignments, isInsert).zip(attrs).map { - case (expression, field) => Assignment(field, expression) - } - } - - /** Align a MergeAction's assignments to target output. Star actions must already be expanded. */ - protected def alignMergeAction(action: MergeAction, targetOutput: Seq[Attribute]): MergeAction = { - // `copyXxxAction` rebuilds the node and drops tags; re-carry FROM_STAR so merge-schema works. - val aligned = action match { - case d @ DeleteAction(_) => d - case u @ PaimonUpdateAction(_, assignments) => - SparkShimLoader.shim.copyUpdateAction(u, alignAssignments(targetOutput, assignments)) - case i @ InsertAction(_, assignments) => - SparkShimLoader.shim.copyInsertAction( - i, - alignAssignments(targetOutput, assignments, isInsert = true)) - case _: UpdateStarAction | _: InsertStarAction => - throw new RuntimeException(s"Star action should already be expanded: $action") - case _ => - throw new RuntimeException(s"Can't recognize this action: $action") - } - PaimonMergeActionTags.carryFromStar(action, aligned) - } - - private def recursiveAlignUpdates( - targetAttrs: Seq[NamedExpression], - updates: Seq[AttrUpdate], - namePrefix: Seq[String] = Nil, - isInsert: Boolean = false): Seq[Expression] = { - - // build aligned updated expression for each target attr - targetAttrs.map { - targetAttr => - val headMatchedUpdates = updates.filter(u => resolver(u.ref.head, 
targetAttr.name)) - if (headMatchedUpdates.isEmpty) { - if (isInsert) { - // For Insert, use default value or NULL for missing columns - getDefaultValueOrNull(targetAttr) - } else { - // For Update, return the attr as is - targetAttr - } - } else { - val exactMatchedUpdate = headMatchedUpdates.find(_.ref.size == 1) - if (exactMatchedUpdate.isDefined) { - if (headMatchedUpdates.size == 1) { - // when an exact match (no nested fields) occurs, it must be the only match, then return it's expr - resolveByNameAndCast(exactMatchedUpdate.get.expr, targetAttr.dataType) - } else { - // otherwise, there must be conflicting updates, for example: - // - update the same attr multiple times - // - update a struct attr and its fields at the same time (e.g. s and s.c1) - val conflictingAttrNames = - headMatchedUpdates.map(u => (namePrefix ++ u.ref).mkString(".")).distinct - throw new UnsupportedOperationException( - s"Conflicting update/insert on attrs: ${conflictingAttrNames.mkString(", ")}" - ) - } - } else { - targetAttr.dataType match { - case StructType(fields) => - val fieldExprs = fields.zipWithIndex.map { - case (field, ordinal) => - Alias(GetStructField(targetAttr, ordinal, Some(field.name)), field.name)() - } - val newUpdates = updates.map(u => u.copy(ref = u.ref.tail)) - // process StructType's nested fields recursively - val updatedFieldExprs = - recursiveAlignUpdates( - fieldExprs, - newUpdates, - namePrefix :+ targetAttr.name, - isInsert) - - // build updated struct expression - CreateNamedStruct(fields.zip(updatedFieldExprs).flatMap { - case (field, expr) => - Seq(Literal(field.name), expr) - }) - case _ => - // can't reach here - throw new UnsupportedOperationException("") - } - } - } - } - } - - /** Get the default value expression for an attribute, or NULL if no default value is defined. 
*/ - private def getDefaultValueOrNull(attr: NamedExpression): Expression = { - attr match { - case a: Attribute if a.metadata.contains(CURRENT_DEFAULT_COLUMN_METADATA_KEY) => - val defaultValueStr = a.metadata.getString(CURRENT_DEFAULT_COLUMN_METADATA_KEY) - parseAndResolveDefaultValue(defaultValueStr, a) - case _ => - Literal(null, attr.dataType) - } - } - - /** Parse the default value string and resolve it to an expression. */ - private def parseAndResolveDefaultValue(defaultValueStr: String, attr: Attribute): Expression = { - try { - val spark = SparkSession.active - val parsed = spark.sessionState.sqlParser.parseExpression(defaultValueStr) - castIfNeeded(parsed, attr.dataType) - } catch { - case _: Exception => - // If parsing fails, fall back to NULL - Literal(null, attr.dataType) - } - } - - /** - * Resolve an assignment value expression by-name against the target type, then cast if needed. - * Recursively reorders nested type fields (Struct, Array, Map and any combination) by name to - * match target field order before casting. This is consistent with Spark's native MERGE INTO - * behavior (see TableOutputResolver.resolveUpdate). - */ - private def resolveByNameAndCast(expression: Expression, targetType: DataType): Expression = { - if (PaimonUtils.sameType(expression.dataType, targetType)) { - // Types already structurally identical — no reordering needed. - // This guarantees idempotence when the rule is applied multiple times. - castIfNeeded(expression, targetType) - } else { - val reordered = reorderFieldsByName(expression, expression.dataType, targetType) - castIfNeeded(reordered, targetType) - } - } - - /** - * Recursively reorder nested type fields by name to match target type's field order. Supports - * StructType, ArrayType and MapType in any nesting combination. Returns the original expression - * if no reordering is needed. 
- */ - private def reorderFieldsByName( - expression: Expression, - sourceType: DataType, - targetType: DataType): Expression = { - (sourceType, targetType) match { - case (s: StructType, t: StructType) if s != t => - reorderStructByName(expression, s, t) - case (ArrayType(sElem, sNull), ArrayType(tElem, _)) if sElem != tElem => - val elementVar = NamedLambdaVariable("element", sElem, sNull) - val reordered = reorderFieldsByName(elementVar, sElem, tElem) - ArrayTransform(expression, LambdaFunction(reordered, Seq(elementVar))) - case (MapType(sKey, sVal, sValNull), MapType(tKey, tVal, _)) - if sKey != tKey || sVal != tVal => - val keyVar = NamedLambdaVariable("key", sKey, nullable = false) - val valVar = NamedLambdaVariable("value", sVal, sValNull) - val reorderedKey = reorderFieldsByName(keyVar, sKey, tKey) - val reorderedVal = reorderFieldsByName(valVar, sVal, tVal) - val newKeys = ArrayTransform(MapKeys(expression), LambdaFunction(reorderedKey, Seq(keyVar))) - val newVals = - ArrayTransform(MapValues(expression), LambdaFunction(reorderedVal, Seq(valVar))) - MapFromArrays(newKeys, newVals) - case _ => - expression - } - } - - /** Reorder source struct fields to match target field order by name, recursing into nested types. 
*/ - private def reorderStructByName( - expression: Expression, - sourceStruct: StructType, - targetStruct: StructType): Expression = { - val reorderedFields = targetStruct.map { - targetField => - sourceStruct.fields.zipWithIndex.find(_._1.name == targetField.name) match { - case Some((sourceField, sourceIdx)) => - val fieldExpr = GetStructField(expression, sourceIdx, Some(sourceField.name)) - val reordered = - reorderFieldsByName(fieldExpr, sourceField.dataType, targetField.dataType) - Alias(reordered, targetField.name)() - case None => - Alias(Literal(null, targetField.dataType), targetField.name)() - } - } - val struct = CreateNamedStruct(reorderedFields.flatMap(a => Seq(Literal(a.name), a.child))) - if (expression.nullable) { - If(IsNull(expression), Literal(null, struct.dataType), struct) - } else { - struct - } - } -} diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/MergeSchemaEvolutionHelper.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/MergeSchemaEvolutionHelper.scala index 9e44f8c31140..e94621c2de1b 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/MergeSchemaEvolutionHelper.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/MergeSchemaEvolutionHelper.scala @@ -27,26 +27,22 @@ import org.apache.paimon.table.FileStoreTable import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.analysis.Resolver -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, InsertAction, MergeAction, MergeIntoTable, UpdateAction} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, ExprId, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, 
InsertAction, MergeAction, MergeIntoTable} import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.paimon.shims.SparkShimLoader import org.apache.spark.sql.types.{StructField, StructType} /** - * Shared MERGE INTO `merge-schema=true` evolution logic. Mixed in by both the postHoc V1 path - * ([[PaimonMergeIntoBase]]) and the Spark 4.1 Resolution-batch rewrite for pure append-only tables - * (`Spark41MergeIntoRewrite`). Only fires when at least one action was authored as `UPDATE *` / - * `INSERT *`, tracked via [[PaimonMergeActionTags]]. + * Shared MERGE INTO `merge-schema=true` evolution. Triggers on `UPDATE *` / `INSERT *` (via + * [[PaimonMergeActionTags]]) or on any explicit assignment whose key resolved to a source-bound + * attribute (via [[PaimonMergeIntoResolver.resolveAssignments]] fallback — that shape is the + * marker, no extra tag). Evolution is scoped to source columns referenced by matched / not-matched + * actions; NOT MATCHED BY SOURCE can't reference source columns. */ trait MergeSchemaEvolutionHelper extends ExpressionHelper { - /** - * @param resolveNotMatchedBySource - * how to resolve `notMatchedBySourceActions` on Spark 3.2+/4.x; the version-specific shim is - * supplied by the caller. 
- */ protected def evolveTargetIfNeeded( merge: MergeIntoTable, relation: DataSourceV2Relation, @@ -58,11 +54,33 @@ trait MergeSchemaEvolutionHelper extends ExpressionHelper { val notMatchedBySourceActions = resolveNotMatchedBySource(merge) val allActions = merge.matchedActions ++ merge.notMatchedActions ++ notMatchedBySourceActions - if (!allActions.exists(PaimonMergeActionTags.isFromStar)) return None + if ( + !allActions + .exists(a => PaimonMergeActionTags.isFromStar(a) || hasSourceBoundKey(a, merge)) + ) { + return None + } + + val resolver = spark.sessionState.conf.resolver + val scopedActions = merge.matchedActions ++ merge.notMatchedActions + val sourceExprIds: Set[ExprId] = merge.sourceTable.output.map(_.exprId).toSet + val containsStar = scopedActions.exists(PaimonMergeActionTags.isFromStar) + val scopedNames: Set[String] = if (containsStar) { + merge.sourceTable.output.map(_.name).toSet + } else { + scopedActions + .flatMap(extractAssignments) + .collect { + case Assignment(attr: Attribute, _) if sourceExprIds.contains(attr.exprId) => attr.name + } + .toSet + } val fileStoreTable = v2Table.getTable.asInstanceOf[FileStoreTable] val sourceSchema = StructType( - merge.sourceTable.output.map(a => StructField(a.name, a.dataType, a.nullable))) + merge.sourceTable.output + .filter(a => scopedNames.exists(n => resolver(n, a.name))) + .map(a => StructField(a.name, a.dataType, a.nullable))) val filteredSourceSchema = SparkSystemColumns.filterSparkSystemColumns(sourceSchema) val allowExplicitCast = OptionUtils.writeMergeSchemaExplicitCastEnabled() val updatedFileStoreTable = SchemaHelper @@ -75,7 +93,6 @@ trait MergeSchemaEvolutionHelper extends ExpressionHelper { } val updatedV2Table = v2Table.copy(table = updatedFileStoreTable) - val resolver = spark.sessionState.conf.resolver val mergedSparkSchema = SparkTypeUtils.fromPaimonRowType(updatedFileStoreTable.schema().logicalRowType()) val newOutput = buildEvolvedOutput(mergedSparkSchema, relation.output, resolver) @@ 
-89,21 +106,41 @@ trait MergeSchemaEvolutionHelper extends ExpressionHelper { val oldNames = relation.output.map(_.name).toSet newOutput.filterNot(a => oldNames.exists(resolver(_, a.name))) } - val expand = expandAction(newAttrs, merge.sourceTable.output, resolver) _ + // Refresh target-side refs to the NEW dataType — `semanticEquals` checks dataType too. + val newAttrById: Map[ExprId, AttributeReference] = newOutput.map(a => a.exprId -> a).toMap + val refresh = refreshTargetRefs(newAttrById) _ + val transformOne = transformAction(refresh, newAttrs, merge.sourceTable.output, resolver) _ val updatedMerge = SparkShimLoader.shim.createMergeIntoTable( updatedTargetTable, merge.sourceTable, - merge.mergeCondition, - merge.matchedActions.map(expand), - merge.notMatchedActions.map(expand), - notMatchedBySourceActions.map(expand), + refresh(merge.mergeCondition), + merge.matchedActions.map(transformOne), + merge.notMatchedActions.map(transformOne), + notMatchedBySourceActions.map(transformOne), withSchemaEvolution = false ) Some((updatedMerge, updatedRelation, updatedV2Table)) } - /** Rebuild the relation's output: reuse existing attribute ids, fabricate ones for new fields. */ + protected def alignAllMergeActions( + m: MergeIntoTable, + targetOutput: Seq[Attribute]): MergeIntoTable = { + val mergeSchemaEnabled = OptionUtils.writeMergeSchemaEnabled() + val shim = SparkShimLoader.shim + shim.createMergeIntoTable( + m.targetTable, + m.sourceTable, + m.mergeCondition, + PaimonAssignmentUtils.alignActions(m.matchedActions, targetOutput, mergeSchemaEnabled), + PaimonAssignmentUtils.alignActions(m.notMatchedActions, targetOutput, mergeSchemaEnabled), + PaimonAssignmentUtils + .alignActions(shim.notMatchedBySourceActions(m), targetOutput, mergeSchemaEnabled), + withSchemaEvolution = false + ) + } + + /** Reuse existing attribute ids; fabricate ones for new fields. 
*/ private def buildEvolvedOutput( mergedSparkSchema: StructType, oldOutput: Seq[Attribute], @@ -120,34 +157,78 @@ trait MergeSchemaEvolutionHelper extends ExpressionHelper { } } + /** Rewrite target-side `AttributeReference`s to the NEW attribute; source-side refs untouched. */ + private def refreshTargetRefs(newAttrById: Map[ExprId, AttributeReference])( + expr: Expression): Expression = expr.transformUp { + case ar: AttributeReference => newAttrById.getOrElse(ar.exprId, ar) + } + /** - * Append assignments for newly-added columns and re-tag the action. Star clauses pull values from - * source by name; explicit clauses fill NULL. + * Single pass per action: refresh target refs + rebind source-bound keys to the evolved target + + * fill remaining new columns (`*`-actions pull from source by name, explicit clauses NULL). */ - private def expandAction( + private def transformAction( + refresh: Expression => Expression, newAttrs: Seq[AttributeReference], sourceOutput: Seq[Attribute], resolver: Resolver)(action: MergeAction): MergeAction = { val fromStar = PaimonMergeActionTags.isFromStar(action) - val newAssignments = newAttrs.map { - attr => - val value: Expression = if (fromStar) { - sourceOutput - .find(s => resolver(s.name, attr.name)) - .map(s => castIfNeeded(s, attr.dataType)) - .getOrElse(Literal(null, attr.dataType)) - } else { - Literal(null, attr.dataType) - } - Assignment(attr, value) + val sourceExprIds = sourceOutput.map(_.exprId).toSet + val newAttrByName: Map[String, AttributeReference] = newAttrs.map(a => a.name -> a).toMap + + def transformAssignments(assignments: Seq[Assignment]): Seq[Assignment] = { + val covered = scala.collection.mutable.Set.empty[String] + val transformed = assignments.map { + case Assignment(key: Attribute, value) if sourceExprIds.contains(key.exprId) => + newAttrByName + .collectFirst { + case (name, target) if resolver(name, key.name) => + covered += target.name + Assignment(target, refresh(value)) + } + 
.getOrElse(Assignment(refresh(key), refresh(value))) + case a => Assignment(refresh(a.key), refresh(a.value)) + } + val fill = newAttrs.filterNot(a => covered.exists(resolver(_, a.name))).map { + attr => + val value: Expression = if (fromStar) { + sourceOutput + .find(s => resolver(s.name, attr.name)) + .map(s => castIfNeeded(s, attr.dataType)) + .getOrElse(Literal(null, attr.dataType)) + } else { + Literal(null, attr.dataType) + } + Assignment(attr, value) + } + transformed ++ fill } - val expanded = action match { - case i: InsertAction => - SparkShimLoader.shim.copyInsertAction(i, i.assignments ++ newAssignments) - case u: UpdateAction => - SparkShimLoader.shim.copyUpdateAction(u, u.assignments ++ newAssignments) - case d: DeleteAction => d + + val shim = SparkShimLoader.shim + val rebuilt = action match { + case DeleteAction(condition) => + DeleteAction(condition.map(refresh)) + case PaimonUpdateAction(condition, assignments) => + shim.createUpdateAction(condition.map(refresh), transformAssignments(assignments)) + case InsertAction(condition, assignments) => + shim.createInsertAction(condition.map(refresh), transformAssignments(assignments)) + case other => other } - PaimonMergeActionTags.carryFromStar(action, expanded) + PaimonMergeActionTags.carryFromStar(action, rebuilt) + } + + private def hasSourceBoundKey(action: MergeAction, merge: MergeIntoTable): Boolean = { + val sourceSet = merge.sourceTable.outputSet + val targetSet = merge.targetTable.outputSet + extractAssignments(action).exists(_.key match { + case attr: Attribute => sourceSet.contains(attr) && !targetSet.contains(attr) + case _ => false + }) + } + + private def extractAssignments(action: MergeAction): Seq[Assignment] = action match { + case PaimonUpdateAction(_, assignments) => assignments + case InsertAction(_, assignments) => assignments + case _ => Nil } } diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala 
b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala index d3ff947ce736..5400e9865fab 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala @@ -51,7 +51,6 @@ class PaimonAnalysis(session: SparkSession) extends Rule[LogicalPlan] { table.output, a.query, a.isByName, - conf, mergeSchemaEnabled) if (newQuery ne a.query) { // Tag to short-circuit the next Analyzer pass; otherwise inline-kept extras would loop. diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAssignmentUtils.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAssignmentUtils.scala new file mode 100644 index 000000000000..2b2c13bba8bd --- /dev/null +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAssignmentUtils.scala @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.catalyst.analysis + +import org.apache.paimon.spark.SparkTypeUtils.CURRENT_DEFAULT_COLUMN_METADATA_KEY +import org.apache.paimon.spark.catalyst.Compatibility +import org.apache.paimon.spark.catalyst.analysis.PaimonOutputResolver.MissingFieldBehavior + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, CreateNamedStruct, Expression, GetStructField, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, InsertAction, InsertStarAction, MergeAction, UpdateStarAction} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils +import org.apache.spark.sql.paimon.shims.SparkShimLoader +import org.apache.spark.sql.types.StructType + +/** + * Top-level alignment for MERGE actions. Per-value work goes through + * [[PaimonOutputResolver.resolveValue]] with a [[MissingFieldBehavior]] picked here: `UPDATE *` + + * struct target under merge-schema → `PreserveTarget` (keeps current target value); otherwise + * `MissingFieldBehavior.of(mergeSchemaEnabled)`. + * + * Strict mode: top-level `INSERT *` / `UPDATE *` missing source column throws; explicit clauses + * still fill / preserve unmentioned columns. Merge-schema mode: missing top-level columns get + * `CURRENT_DEFAULT` / NULL; source-extras kept for write-time schema evolution. + */ +object PaimonAssignmentUtils extends SQLConfHelper { + + def alignActions( + actions: Seq[MergeAction], + targetOutput: Seq[Attribute], + mergeSchemaEnabled: Boolean): Seq[MergeAction] = { + actions.map(alignAction(_, targetOutput, mergeSchemaEnabled)) + } + + // Re-carries `FROM_STAR` (dropped on reconstruct); construction goes through the version shim. 
+ def alignAction( + action: MergeAction, + targetOutput: Seq[Attribute], + mergeSchemaEnabled: Boolean): MergeAction = { + val fromStar = PaimonMergeActionTags.isFromStar(action) + val shim = SparkShimLoader.shim + val aligned = action match { + case d @ DeleteAction(_) => d + case PaimonUpdateAction(condition, assignments) => + shim.createUpdateAction( + condition, + alignUpdateAssignments(targetOutput, assignments, fromStar, mergeSchemaEnabled)) + case InsertAction(condition, assignments) => + shim.createInsertAction( + condition, + alignInsertAssignments(targetOutput, assignments, fromStar, mergeSchemaEnabled)) + case _: UpdateStarAction | _: InsertStarAction => + throw new RuntimeException(s"Star action should already be expanded: $action") + case _ => + throw new RuntimeException(s"Can't recognize this action: $action") + } + PaimonMergeActionTags.carryFromStar(action, aligned) + } + + /** Align UPDATE assignments (incl. nested keys like `s.c1`) to the target attribute list. */ + def alignUpdateAssignments( + attrs: Seq[Attribute], + assignments: Seq[Assignment], + fromStar: Boolean, + mergeSchemaEnabled: Boolean): Seq[Assignment] = { + + val output = attrs.map { + attr => + applyAssignments( + col = restoreActualType(attr), + colExpr = attr, + assignments = assignments, + colPath = Seq(attr.name), + mergeSchemaEnabled = mergeSchemaEnabled, + updateStar = fromStar + ) + } + attrs.zip(output).map { case (attr, expr) => Assignment(attr, expr) } + } + + /** Align INSERT assignments. Nested-key INSERT is rejected. 
*/ + def alignInsertAssignments( + attrs: Seq[Attribute], + assignments: Seq[Assignment], + fromStar: Boolean, + mergeSchemaEnabled: Boolean): Seq[Assignment] = { + + val (topLevel, nested) = assignments.partition(_.key.isInstanceOf[Attribute]) + if (nested.nonEmpty) { + val nestedStr = nested.map(_.sql).mkString(", ") + throw new RuntimeException(s"INSERT assignment keys cannot be nested fields: $nestedStr") + } + val resolver = conf.resolver + + attrs.map { + attr => + val matching = topLevel.collect { + case a if a.key.semanticEquals(attr) => a + case a @ Assignment(k: Attribute, _) if resolver(k.name, attr.name) => a + } + val value = if (matching.isEmpty) { + if (fromStar && !mergeSchemaEnabled) { + throw new RuntimeException( + s"Cannot INSERT * into target column '${attr.name}': source is missing this " + + s"column. Enable 'spark.paimon.write.merge-schema' to fill it with " + + s"CURRENT_DEFAULT / NULL.") + } + getDefaultValueOrNull(attr) + } else if (matching.size > 1) { + val conflict = matching.map(_.value.sql).mkString(", ") + throw new RuntimeException(s"Multiple assignments for '${attr.name}': $conflict") + } else { + val actual = restoreActualType(attr) + PaimonOutputResolver.resolveValue( + matching.head.value, + actual, + MissingFieldBehavior.of(mergeSchemaEnabled), + None, + Seq(attr.name)) + } + Assignment(attr, value) + } + } + + private def applyAssignments( + col: Attribute, + colExpr: Expression, + assignments: Seq[Assignment], + colPath: Seq[String], + mergeSchemaEnabled: Boolean, + updateStar: Boolean = false): Expression = { + + val (exactAssignments, otherAssignments) = + assignments.partition(assignment => assignment.key.semanticEquals(colExpr)) + + val fieldAssignments = + otherAssignments.filter( + assignment => assignment.key.find(_.semanticEquals(colExpr)).isDefined) + + if (exactAssignments.size > 1) { + val conflict = exactAssignments.map(_.value.sql).mkString(", ") + throw new RuntimeException(s"Multiple assignments for 
'${colPath.mkString(".")}': $conflict") + } else if (exactAssignments.nonEmpty && fieldAssignments.nonEmpty) { + val conflict = (exactAssignments ++ fieldAssignments).map(_.sql).mkString(", ") + throw new RuntimeException( + s"Conflicting assignments for '${colPath.mkString(".")}': $conflict") + } else if (exactAssignments.isEmpty && fieldAssignments.isEmpty) { + // Strict `UPDATE *` requires every top-level target column to have a source match. + if (updateStar && !mergeSchemaEnabled && colPath.size == 1) { + throw new RuntimeException( + s"Cannot UPDATE * target column '${colPath.mkString(".")}': source is missing this " + + s"column. Enable 'spark.paimon.write.merge-schema' to preserve the current value.") + } + colExpr + } else if (exactAssignments.nonEmpty) { + val value = exactAssignments.head.value + val (behavior, targetExpr) = col.dataType match { + case _: StructType if updateStar && mergeSchemaEnabled => + (MissingFieldBehavior.PreserveTarget, Some(colExpr)) + case _ => + (MissingFieldBehavior.of(mergeSchemaEnabled), None) + } + PaimonOutputResolver.resolveValue(value, col, behavior, targetExpr, colPath) + } else { + applyFieldAssignments(col, colExpr, fieldAssignments, colPath, mergeSchemaEnabled) + } + } + + private def applyFieldAssignments( + col: Attribute, + colExpr: Expression, + assignments: Seq[Assignment], + colPath: Seq[String], + mergeSchemaEnabled: Boolean): Expression = { + + col.dataType match { + case structType: StructType => + val fieldAttrs = toAttributes(structType) + val fieldExprs = structType.fields.zipWithIndex.map { + case (field, ordinal) => GetStructField(colExpr, ordinal, Some(field.name)) + } + val updatedFieldExprs = fieldAttrs.zip(fieldExprs).map { + case (fieldAttr, fieldExpr) => + applyAssignments( + fieldAttr, + fieldExpr, + assignments, + colPath :+ fieldAttr.name, + mergeSchemaEnabled) + } + toNamedStruct(structType, updatedFieldExprs) + + case otherType => + throw new RuntimeException( + s"Updating nested fields is 
only supported for StructType but " + + s"'${colPath.mkString(".")}' is of type $otherType") + } + } + + private def toNamedStruct(structType: StructType, fieldExprs: Seq[Expression]): Expression = { + val parts = structType.fields.zip(fieldExprs).flatMap { + case (field, expr) => Seq(Literal(field.name), expr) + } + CreateNamedStruct(parts) + } + + private def restoreActualType(attr: Attribute): Attribute = { + attr.withDataType(CharVarcharUtils.getRawType(attr.metadata).getOrElse(attr.dataType)) + } + + private def toAttributes(structType: StructType): Seq[Attribute] = { + structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) + } + + /** Parse the column's `CURRENT_DEFAULT` SQL (cast to column type) or fall back to NULL. */ + private def getDefaultValueOrNull(attr: Attribute): Expression = { + val nullLit = Literal(null, attr.dataType) + if (attr.metadata.contains(CURRENT_DEFAULT_COLUMN_METADATA_KEY)) { + try { + val parsed = SparkSession.active.sessionState.sqlParser + .parseExpression(attr.metadata.getString(CURRENT_DEFAULT_COLUMN_METADATA_KEY)) + if (parsed.dataType == attr.dataType) parsed + else Compatibility.cast(parsed, attr.dataType, Option(conf.sessionLocalTimeZone)) + } catch { + case scala.util.control.NonFatal(_) => nullLit + } + } else { + nullLit + } + } +} diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeActionTags.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeActionTags.scala index 04e283319e9c..3e343cc3e94a 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeActionTags.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeActionTags.scala @@ -18,20 +18,24 @@ package org.apache.paimon.spark.catalyst.analysis -import org.apache.spark.sql.catalyst.plans.logical.MergeAction 
+import org.apache.spark.sql.catalyst.plans.logical.{MergeAction, MergeIntoTable} import org.apache.spark.sql.catalyst.trees.TreeNodeTag /** - * Marks a `MergeAction` that originated from `INSERT *` / `UPDATE *`. After resolution a star - * action is indistinguishable from an explicit clause that happens to list every column, but - * merge-schema needs the original intent to decide between pulling new source columns and filling - * NULL. Tags do not survive fresh constructor calls — rebuild sites must re-tag via - * `carryFromStar`. + * [[FROM_STAR]] records that a `MergeAction` came from `INSERT *` / `UPDATE *` — merge-schema needs + * the original intent to decide between pulling new source columns and filling NULL. Tags are + * dropped on reconstruct, so rebuild sites must re-tag via `carryFromStar`. + * + * [[ALIGNED]] is a node-level tag on `MergeIntoTable` set after the post-hoc rule has aligned + * assignments to the target output. The outer analyzer batch re-runs the rule once to verify + * idempotence; this tag short-circuits re-application so values aren't wrapped twice. 
*/ object PaimonMergeActionTags { val FROM_STAR: TreeNodeTag[Boolean] = TreeNodeTag[Boolean]("paimon.merge.fromStar") + val ALIGNED: TreeNodeTag[Boolean] = TreeNodeTag[Boolean]("paimon.merge.aligned") + def isFromStar(action: MergeAction): Boolean = action.getTagValue(FROM_STAR).contains(true) @@ -43,4 +47,12 @@ object PaimonMergeActionTags { def carryFromStar[T <: MergeAction](source: MergeAction, target: T): T = { if (isFromStar(source)) markFromStar(target) else target } + + def isAligned(merge: MergeIntoTable): Boolean = + merge.getTagValue(ALIGNED).contains(true) + + def markAligned[T <: MergeIntoTable](merge: T): T = { + merge.setTagValue(ALIGNED, true) + merge + } } diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala index 45916be76136..eaa5cb19b828 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeInto.scala @@ -18,29 +18,132 @@ package org.apache.paimon.spark.catalyst.analysis +import org.apache.paimon.spark.SparkTable +import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper +import org.apache.paimon.spark.commands.{MergeIntoPaimonDataEvolutionTable, MergeIntoPaimonTable} + import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.{MergeAction, MergeIntoTable} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, SubqueryExpression} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.paimon.shims.SparkShimLoader + +import scala.collection.JavaConverters._ /** A 
post-hoc resolution rule for MergeInto. */ -case class PaimonMergeInto(spark: SparkSession) extends PaimonMergeIntoBase { - - /** - * Align all MergeActions in a MergeIntoTable based on the target table's output attributes. - * Returns a new MergeIntoTable with aligned matchedActions, notMatchedActions, and - * notMatchedBySourceActions. - */ - override def alignMergeIntoTable( - m: MergeIntoTable, - targetOutput: Seq[Attribute]): MergeIntoTable = { - m.copy( - matchedActions = m.matchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedActions = m.notMatchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedBySourceActions = m.notMatchedBySourceActions.map(alignMergeAction(_, targetOutput)) - ) +case class PaimonMergeInto(spark: SparkSession) + extends Rule[LogicalPlan] + with RowLevelHelper + with ExpressionHelper + with MergeSchemaEvolutionHelper { + + override val operation: RowLevelOp = MergeInto + + def apply(plan: LogicalPlan): LogicalPlan = { + // Spark 4.1 marks the plan analyzed before postHoc runs, bypassing `resolveOperators`. Pure + // append-only tables on 4.1+ are handled earlier by `Spark41MergeIntoRewrite`. 
+ AnalysisHelper.allowInvokingTransformsInAnalyzer { + plan.transformDown { + case merge: MergeIntoTable + if merge.resolved && PaimonRelation.isPaimonTable(merge.targetTable) && + !PaimonMergeActionTags.isAligned(merge) => + val relation = PaimonRelation.getPaimonRelation(merge.targetTable) + var v2Table = relation.table.asInstanceOf[SparkTable] + + checkPaimonTable(v2Table.getTable) + checkCondition(merge.mergeCondition) + (merge.matchedActions ++ merge.notMatchedActions) + .flatMap(_.condition) + .foreach(checkCondition) + + val primaryKeys = v2Table.getTable.primaryKeys().asScala.toSeq + if (primaryKeys.nonEmpty) { + val updateActions = merge.matchedActions.collect { case a: UpdateAction => a } + checkUpdateActionValidity( + AttributeSet(relation.output), + merge.mergeCondition, + updateActions, + primaryKeys) + } + + // Commit schema changes before alignment so the aligned plan sees new columns. + val (resolvedMerge, targetOutput) = + evolveTargetIfNeeded(merge, relation, v2Table, spark, resolveNotMatchedBySourceActions) + .map { case (m, r, t) => v2Table = t; (m, r.output) } + .getOrElse((merge, relation.output)) + + val aligned = alignAllMergeActions(resolvedMerge, targetOutput) + + if (!shouldFallbackToV1MergeInto(aligned)) { + // Tag so the analyzer's idempotence re-run skips this node (V1 fallback is a different + // node type and doesn't need the tag). 
+ PaimonMergeActionTags.markAligned(aligned) + } else { + buildV1Command(v2Table, resolvedMerge, aligned) + } + } + } + } + + private def buildV1Command( + v2Table: SparkTable, + resolvedMerge: MergeIntoTable, + aligned: MergeIntoTable): LogicalPlan = { + val notMatchedBySource = resolveNotMatchedBySourceActions(aligned) + if (v2Table.coreOptions.dataEvolutionEnabled()) { + MergeIntoPaimonDataEvolutionTable( + v2Table, + resolvedMerge.targetTable, + resolvedMerge.sourceTable, + resolvedMerge.mergeCondition, + aligned.matchedActions, + aligned.notMatchedActions, + notMatchedBySource + ) + } else { + MergeIntoPaimonTable( + v2Table, + resolvedMerge.targetTable, + resolvedMerge.sourceTable, + resolvedMerge.mergeCondition, + aligned.matchedActions, + aligned.notMatchedActions, + notMatchedBySource + ) + } + } + + private def checkCondition(condition: Expression): Unit = { + if (!condition.resolved) { + throw new RuntimeException(s"Condition $condition should have been resolved.") + } + if (SubqueryExpression.hasSubquery(condition)) { + throw new RuntimeException(s"Condition $condition with subquery can't be supported.") + } } - override def resolveNotMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = { - merge.notMatchedBySourceActions + private def checkUpdateActionValidity( + targetOutput: AttributeSet, + mergeCondition: Expression, + actions: Seq[UpdateAction], + primaryKeys: Seq[String]): Unit = { + lazy val isMergeConditionValid = { + val mergeExpressions = splitConjunctivePredicates(mergeCondition) + primaryKeys.forall { + primaryKey => isUpdateExpressionToPrimaryKey(targetOutput, mergeExpressions, primaryKey) + } + } + + def isUpdateActionValid(action: UpdateAction): Boolean = { + validUpdateAssignment(targetOutput, primaryKeys, action.assignments) + } + + val valid = isMergeConditionValid || actions.forall(isUpdateActionValid) + if (!valid) { + throw new RuntimeException("Can't update the primary key column in update clause.") + } } + + private 
def resolveNotMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = + SparkShimLoader.shim.notMatchedBySourceActions(merge) } diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoBase.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoBase.scala deleted file mode 100644 index 747b0477a5f6..000000000000 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoBase.scala +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.paimon.spark.SparkTable -import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper -import org.apache.paimon.spark.commands.{MergeIntoPaimonDataEvolutionTable, MergeIntoPaimonTable} - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, SubqueryExpression} -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.rules.Rule - -import scala.collection.JavaConverters._ - -trait PaimonMergeIntoBase - extends Rule[LogicalPlan] - with RowLevelHelper - with ExpressionHelper - with AssignmentAlignmentHelper - with MergeSchemaEvolutionHelper { - - val spark: SparkSession - - override val operation: RowLevelOp = MergeInto - - def apply(plan: LogicalPlan): LogicalPlan = { - // Spark 4.1 marks the plan analyzed before postHoc runs, so `transformDown` is needed to - // bypass `resolveOperators`'s short-circuit. Pure append-only tables on 4.1+ are handled - // earlier by `Spark41MergeIntoRewrite` and never reach here. - AnalysisHelper.allowInvokingTransformsInAnalyzer { - plan.transformDown { - case merge: MergeIntoTable - if merge.resolved && PaimonRelation.isPaimonTable(merge.targetTable) => - val relation = PaimonRelation.getPaimonRelation(merge.targetTable) - var v2Table = relation.table.asInstanceOf[SparkTable] - - checkPaimonTable(v2Table.getTable) - checkCondition(merge.mergeCondition) - (merge.matchedActions ++ merge.notMatchedActions) - .flatMap(_.condition) - .foreach(checkCondition) - - val primaryKeys = v2Table.getTable.primaryKeys().asScala.toSeq - if (primaryKeys.nonEmpty) { - val updateActions = merge.matchedActions.collect { case a: UpdateAction => a } - checkUpdateActionValidity( - AttributeSet(relation.output), - merge.mergeCondition, - updateActions, - primaryKeys) - } - - // Commit schema changes before alignment so the aligned plan sees new columns. 
- val (resolvedMerge, targetOutput) = - evolveTargetIfNeeded(merge, relation, v2Table, spark, resolveNotMatchedBySourceActions) - .map { case (m, r, t) => v2Table = t; (m, r.output) } - .getOrElse((merge, relation.output)) - - val aligned = alignMergeIntoTable(resolvedMerge, targetOutput) - - if (!shouldFallbackToV1MergeInto(aligned)) { - aligned - } else { - buildV1Command(v2Table, resolvedMerge, aligned) - } - } - } - } - - private def buildV1Command( - v2Table: SparkTable, - resolvedMerge: MergeIntoTable, - aligned: MergeIntoTable): LogicalPlan = { - val notMatchedBySource = resolveNotMatchedBySourceActions(aligned) - if (v2Table.coreOptions.dataEvolutionEnabled()) { - MergeIntoPaimonDataEvolutionTable( - v2Table, - resolvedMerge.targetTable, - resolvedMerge.sourceTable, - resolvedMerge.mergeCondition, - aligned.matchedActions, - aligned.notMatchedActions, - notMatchedBySource - ) - } else { - MergeIntoPaimonTable( - v2Table, - resolvedMerge.targetTable, - resolvedMerge.sourceTable, - resolvedMerge.mergeCondition, - aligned.matchedActions, - aligned.notMatchedActions, - notMatchedBySource - ) - } - } - - private def checkCondition(condition: Expression): Unit = { - if (!condition.resolved) { - throw new RuntimeException(s"Condition $condition should have been resolved.") - } - if (SubqueryExpression.hasSubquery(condition)) { - throw new RuntimeException(s"Condition $condition with subquery can't be supported.") - } - } - - /** This check will avoid to update the primary key columns */ - private def checkUpdateActionValidity( - targetOutput: AttributeSet, - mergeCondition: Expression, - actions: Seq[UpdateAction], - primaryKeys: Seq[String]): Unit = { - // Check whether there are enough `EqualTo` expressions related to all the primary keys. 
- lazy val isMergeConditionValid = { - val mergeExpressions = splitConjunctivePredicates(mergeCondition) - primaryKeys.forall { - primaryKey => isUpdateExpressionToPrimaryKey(targetOutput, mergeExpressions, primaryKey) - } - } - - // Check whether there are an update expression related to any primary key. - def isUpdateActionValid(action: UpdateAction): Boolean = { - validUpdateAssignment(targetOutput, primaryKeys, action.assignments) - } - - val valid = isMergeConditionValid || actions.forall(isUpdateActionValid) - if (!valid) { - throw new RuntimeException("Can't update the primary key column in update clause.") - } - } - - def resolveNotMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] - - def alignMergeIntoTable(m: MergeIntoTable, targetOutput: Seq[Attribute]): MergeIntoTable -} diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala index b43b7a59a6c5..23defe00c38e 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolver.scala @@ -18,16 +18,95 @@ package org.apache.paimon.spark.catalyst.analysis -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.{DeleteAction, LogicalPlan, MergeAction, MergeIntoTable, UpdateAction} +import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper +import org.apache.paimon.spark.util.OptionUtils -/** Resolve all the expressions for MergeInto. 
*/ -object PaimonMergeIntoResolver extends PaimonMergeIntoResolverBase { +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.paimon.shims.SparkShimLoader - def resolveNotMatchedBySourceActions( +object PaimonMergeIntoResolver extends ExpressionHelper { + + def apply(merge: MergeIntoTable, spark: SparkSession): LogicalPlan = { + val target = merge.targetTable + val source = merge.sourceTable + assert(target.resolved, "Target should have been resolved here.") + assert(source.resolved, "Source should have been resolved here.") + + val resolve: (Expression, LogicalPlan) => Expression = resolveExpression(spark) + + val resolvedCond = resolveCondition(resolve, merge.mergeCondition, merge, ALL) + val resolvedMatched = resolveMatchedByTargetActions(merge, resolve, spark) + val resolvedNotMatched = resolveNotMatchedByTargetActions(merge, resolve, spark) + val resolvedNotMatchedBySource = resolveNotMatchedBySourceActions(merge, resolve) + + SparkShimLoader.shim.createMergeIntoTable( + merge.targetTable, + merge.sourceTable, + resolvedCond, + resolvedMatched, + resolvedNotMatched, + resolvedNotMatchedBySource, + withSchemaEvolution = false + ) + } + + private def resolveMatchedByTargetActions( + merge: MergeIntoTable, + resolve: (Expression, LogicalPlan) => Expression, + spark: SparkSession): Seq[MergeAction] = { + merge.matchedActions.map { + case DeleteAction(condition) => + // The condition can be from both target and source tables + val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) + DeleteAction(resolvedCond) + case PaimonUpdateAction(condition, assignments) => + // The condition and value can be from both target and source tables + val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) + val resolvedAssignments = 
resolveAssignments(resolve, assignments, merge, ALL) + SparkShimLoader.shim.createUpdateAction(resolvedCond, resolvedAssignments) + case UpdateStarAction(condition) => + // The condition can be from both target and source tables, but the value must be from the source table + val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) + val assignments = expandStarAssignments(merge, spark) + // Tag so merge-schema can distinguish `UPDATE *` from a fully-listed explicit clause. + PaimonMergeActionTags.markFromStar( + SparkShimLoader.shim.createUpdateAction(resolvedCond, assignments)) + case action => + throw new RuntimeException(s"Can't recognize this action: $action") + } + } + + private def resolveNotMatchedByTargetActions( + merge: MergeIntoTable, + resolve: (Expression, LogicalPlan) => Expression, + spark: SparkSession): Seq[MergeAction] = { + merge.notMatchedActions.map { + case InsertAction(condition, assignments) => + // The condition and value must be from the source table + val resolvedCond = + condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) + val resolvedAssignments = + resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) + SparkShimLoader.shim.createInsertAction(resolvedCond, resolvedAssignments) + case InsertStarAction(condition) => + // The condition and value must be from the source table + val resolvedCond = + condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) + val assignments = expandStarAssignments(merge, spark) + PaimonMergeActionTags.markFromStar( + SparkShimLoader.shim.createInsertAction(resolvedCond, assignments)) + case action => + throw new RuntimeException(s"Can't recognize this action: $action") + } + } + + private def resolveNotMatchedBySourceActions( merge: MergeIntoTable, resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.notMatchedBySourceActions.map { + SparkShimLoader.shim.notMatchedBySourceActions(merge).map { case DeleteAction(condition) => // The 
condition must be from the target table val resolvedCond = condition.map(resolveCondition(resolve, _, merge, TARGET_ONLY)) @@ -36,24 +115,72 @@ object PaimonMergeIntoResolver extends PaimonMergeIntoResolverBase { // The condition and value must be from the target table val resolvedCond = condition.map(resolveCondition(resolve, _, merge, TARGET_ONLY)) val resolvedAssignments = resolveAssignments(resolve, assignments, merge, TARGET_ONLY) - UpdateAction(resolvedCond, resolvedAssignments) + SparkShimLoader.shim.createUpdateAction(resolvedCond, resolvedAssignments) case action => throw new RuntimeException(s"Can't recognize this action: $action") } } - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable = { - merge.copy( - mergeCondition = resolvedCond, - matchedActions = resolvedMatched, - notMatchedActions = resolvedNotMatched, - notMatchedBySourceActions = resolvedNotMatchedBySource - ) + /** + * Expand `UPDATE *` / `INSERT *` to source-driven assignments. Missing target columns are filled + * later during alignment; source-only top-level columns are dropped here, then re-appended by + * [[MergeSchemaEvolutionHelper]] when merge-schema is enabled. 
+ */ + private def expandStarAssignments(merge: MergeIntoTable, spark: SparkSession): Seq[Assignment] = { + val resolver: Resolver = spark.sessionState.analyzer.resolver + def findTarget(name: String): Option[Attribute] = { + merge.targetTable.output.find(targetAttr => resolver(name, targetAttr.name)) + } + + merge.sourceTable.output.flatMap { + sourceAttr => + findTarget(sourceAttr.name).map(targetAttr => Assignment(targetAttr, sourceAttr)) + } + } + + sealed private trait ResolvedWith + private case object ALL extends ResolvedWith + private case object SOURCE_ONLY extends ResolvedWith + private case object TARGET_ONLY extends ResolvedWith + + private def resolveCondition( + resolve: (Expression, LogicalPlan) => Expression, + condition: Expression, + mergeInto: MergeIntoTable, + resolvedWith: ResolvedWith): Expression = { + resolvedWith match { + case ALL => resolve(condition, mergeInto) + case SOURCE_ONLY => resolve(condition, Project(Nil, mergeInto.sourceTable)) + case TARGET_ONLY => resolve(condition, Project(Nil, mergeInto.targetTable)) + } } + private def resolveAssignments( + resolve: (Expression, LogicalPlan) => Expression, + assignments: Seq[Assignment], + mergeInto: MergeIntoTable, + resolvedWith: ResolvedWith): Seq[Assignment] = { + assignments.map { + assign => + // Merge-schema: fall back to source so a key naming a not-yet-existing target column + // surfaces as a source-bound Attribute for [[MergeSchemaEvolutionHelper]] to detect. + // Strict mode lets the target-attempt failure propagate as "column not found". 
+ val resolvedKey = + if (OptionUtils.writeMergeSchemaEnabled()) { + try resolve(assign.key, Project(Nil, mergeInto.targetTable)) + catch { + case scala.util.control.NonFatal(_) => + resolve(assign.key, Project(Nil, mergeInto.sourceTable)) + } + } else { + resolve(assign.key, Project(Nil, mergeInto.targetTable)) + } + val resolvedValue = resolvedWith match { + case ALL => resolve(assign.value, mergeInto) + case SOURCE_ONLY => resolve(assign.value, Project(Nil, mergeInto.sourceTable)) + case TARGET_ONLY => resolve(assign.value, Project(Nil, mergeInto.targetTable)) + } + Assignment(resolvedKey, resolvedValue) + } + } } diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala deleted file mode 100644 index 4222c54d6e9d..000000000000 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonMergeIntoResolverBase.scala +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.catalyst.analysis - -import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical._ - -trait PaimonMergeIntoResolverBase extends ExpressionHelper { - - def apply(merge: MergeIntoTable, spark: SparkSession): LogicalPlan = { - val target = merge.targetTable - val source = merge.sourceTable - assert(target.resolved, "Target should have been resolved here.") - assert(source.resolved, "Source should have been resolved here.") - - val resolve: (Expression, LogicalPlan) => Expression = resolveExpression(spark) - - val resolvedCond = resolveCondition(resolve, merge.mergeCondition, merge, ALL) - val resolvedMatched = resolveMatchedByTargetActions(merge, resolve) - val resolvedNotMatched = resolveNotMatchedByTargetActions(merge, resolve) - val resolvedNotMatchedBySource = resolveNotMatchedBySourceActions(merge, resolve) - - build(merge, resolvedCond, resolvedMatched, resolvedNotMatched, resolvedNotMatchedBySource) - } - - def build( - merge: MergeIntoTable, - resolvedCond: Expression, - resolvedMatched: Seq[MergeAction], - resolvedNotMatched: Seq[MergeAction], - resolvedNotMatchedBySource: Seq[MergeAction]): MergeIntoTable - - private def resolveMatchedByTargetActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.matchedActions.map { - case DeleteAction(condition) => - // The condition can be from both target and source tables - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - DeleteAction(resolvedCond) - case PaimonUpdateAction(condition, assignments) => - // The condition and value can be from both target and source tables - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - val 
resolvedAssignments = resolveAssignments(resolve, assignments, merge, ALL) - UpdateAction(resolvedCond, resolvedAssignments) - case UpdateStarAction(condition) => - // The condition can be from both target and source tables, but the value must be from the source table - val resolvedCond = condition.map(resolveCondition(resolve, _, merge, ALL)) - val assignments = merge.targetTable.output.map { - attr => Assignment(attr, UnresolvedAttribute(Seq(attr.name))) - } - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - // Tag so merge-schema can distinguish `UPDATE *` from a fully-listed explicit clause. - PaimonMergeActionTags.markFromStar(UpdateAction(resolvedCond, resolvedAssignments)) - case action => - throw new RuntimeException(s"Can't recognize this action: $action") - } - } - - private def resolveNotMatchedByTargetActions( - merge: MergeIntoTable, - resolve: (Expression, LogicalPlan) => Expression): Seq[MergeAction] = { - merge.notMatchedActions.map { - case InsertAction(condition, assignments) => - // The condition and value must be from the source table - val resolvedCond = - condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - InsertAction(resolvedCond, resolvedAssignments) - case InsertStarAction(condition) => - // The condition and value must be from the source table - val resolvedCond = - condition.map(resolveCondition(resolve, _, merge, SOURCE_ONLY)) - val assignments = merge.targetTable.output.map { - attr => Assignment(attr, UnresolvedAttribute(Seq(attr.name))) - } - val resolvedAssignments = - resolveAssignments(resolve, assignments, merge, SOURCE_ONLY) - PaimonMergeActionTags.markFromStar(InsertAction(resolvedCond, resolvedAssignments)) - case action => - throw new RuntimeException(s"Can't recognize this action: $action") - } - } - - def resolveNotMatchedBySourceActions( - merge: MergeIntoTable, - resolve: 
(Expression, LogicalPlan) => Expression): Seq[MergeAction] - - sealed trait ResolvedWith - case object ALL extends ResolvedWith - case object SOURCE_ONLY extends ResolvedWith - case object TARGET_ONLY extends ResolvedWith - - def resolveCondition( - resolve: (Expression, LogicalPlan) => Expression, - condition: Expression, - mergeInto: MergeIntoTable, - resolvedWith: ResolvedWith): Expression = { - resolvedWith match { - case ALL => resolve(condition, mergeInto) - case SOURCE_ONLY => resolve(condition, Project(Nil, mergeInto.sourceTable)) - case TARGET_ONLY => resolve(condition, Project(Nil, mergeInto.targetTable)) - } - } - - def resolveAssignments( - resolve: (Expression, LogicalPlan) => Expression, - assignments: Seq[Assignment], - mergeInto: MergeIntoTable, - resolvedWith: ResolvedWith): Seq[Assignment] = { - assignments.map { - assign => - val resolvedKey = resolve(assign.key, Project(Nil, mergeInto.targetTable)) - val resolvedValue = resolvedWith match { - case ALL => resolve(assign.value, mergeInto) - case SOURCE_ONLY => resolve(assign.value, Project(Nil, mergeInto.sourceTable)) - case TARGET_ONLY => resolve(assign.value, Project(Nil, mergeInto.targetTable)) - } - Assignment(resolvedKey, resolvedValue) - } - } -} diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonOutputResolver.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonOutputResolver.scala index 7a8d4a878d5f..20da69c32123 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonOutputResolver.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonOutputResolver.scala @@ -20,39 +20,58 @@ package org.apache.paimon.spark.catalyst.analysis import org.apache.paimon.spark.catalyst.Compatibility +import org.apache.spark.sql.catalyst.SQLConfHelper import 
org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.util.CharVarcharUtils -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import scala.collection.mutable /** - * Trimmed fork of Spark's `TableOutputResolver` with `mergeSchemaEnabled` for schema evolution. - * - * - Missing target fields: NULL-fill at any depth. - * - Extra source fields: throw when `mergeSchemaEnabled = false`; otherwise kept in the - * projection at any depth so `SchemaHelper.mergeSchema` evolves the table at write time. Safe - * because `PaimonSparkTableBase` advertises `ACCEPT_ANY_SCHEMA`, short-circuiting Spark's - * `outputResolved` check. - * - Nullable downcast: wrap with `AssertNotNull`. - * - Type mismatch: wrap with `Compatibility.cast` tagged for downstream explicit-cast check. + * Forked `TableOutputResolver` parameterized by [[MissingFieldBehavior]]. Top-level always + * NULL-fills missing columns (mirrors Spark INSERT FILL). Source-extras kept at any depth relies on + * Paimon's `ACCEPT_ANY_SCHEMA` short-circuiting `outputResolved`. */ -object PaimonOutputResolver { +object PaimonOutputResolver extends SQLConfHelper { + + /** + * How nested struct field misalignment is handled: + * - [[FailMissing]]: strict — nested missing target / source-extra throws. + * - [[NullForMissing]]: merge-schema for INSERT / explicit UPDATE — missing NULL-fills, + * source-extras kept so [[org.apache.paimon.spark.commands.SchemaHelper]] evolves the table. + * - [[PreserveTarget]]: merge-schema for `UPDATE *` struct — missing source field substitutes + * `GetStructField(targetExpr, ordinal)` to keep the current target value. 
+ */ + sealed trait MissingFieldBehavior + object MissingFieldBehavior { + case object FailMissing extends MissingFieldBehavior + case object NullForMissing extends MissingFieldBehavior + case object PreserveTarget extends MissingFieldBehavior + + def of(mergeSchemaEnabled: Boolean): MissingFieldBehavior = + if (mergeSchemaEnabled) NullForMissing else FailMissing + } + + import MissingFieldBehavior._ def resolveOutputColumns( tableName: String, expected: Seq[Attribute], query: LogicalPlan, byName: Boolean, - conf: SQLConf, mergeSchemaEnabled: Boolean): LogicalPlan = { val resolved: Seq[NamedExpression] = if (byName) { - reorderColumnsByName(tableName, query.output, expected, conf, mergeSchemaEnabled, Nil) + reorderColumnsByName( + tableName, + query.output, + expected, + MissingFieldBehavior.of(mergeSchemaEnabled), + None, + Nil) } else { - resolveColumnsByPosition(tableName, query.output, expected, conf, Nil) + resolveColumnsByPosition(tableName, query.output, expected, Nil) } if (resolved == query.output) { query @@ -61,44 +80,74 @@ object PaimonOutputResolver { } } + /** + * Align a MERGE/UPDATE assignment value to its target attribute. Returns the raw value (no outer + * Alias) — the downstream rewrite re-wraps it. `targetExpr` is required when + * `behavior = PreserveTarget`. 
+ */ + def resolveValue( + value: Expression, + expected: Attribute, + behavior: MissingFieldBehavior, + targetExpr: Option[Expression], + colPath: Seq[String]): Expression = { + if (behavior == PreserveTarget && targetExpr.isEmpty) { + throw new IllegalArgumentException("PreserveTarget behavior requires a targetExpr") + } + val named: NamedExpression = value match { + case ne: NamedExpression => ne + case other => Alias(other, expected.name)() + } + stripOuterAlias( + resolveField( + tableName = "", + input = named, + expected = restoreActualType(expected), + byName = true, + behavior = behavior, + targetExpr = targetExpr, + colPath = colPath)) + } + private def reorderColumnsByName( tableName: String, inputCols: Seq[NamedExpression], expectedCols: Seq[Attribute], - conf: SQLConf, - mergeSchemaEnabled: Boolean, + behavior: MissingFieldBehavior, + targetExpr: Option[Expression], colPath: Seq[String]): Seq[NamedExpression] = { val isTopLevel = colPath.isEmpty val matchedNames = mutable.HashSet.empty[String] - val reordered = expectedCols.map { - expectedCol => + val reordered = expectedCols.zipWithIndex.map { + case (expectedCol, ordinal) => val matches = inputCols.filter(col => conf.resolver(col.name, expectedCol.name)) val newColPath = colPath :+ expectedCol.name if (matches.isEmpty) { - nullFill(expectedCol) + fillMissing(tableName, expectedCol, ordinal, isTopLevel, behavior, targetExpr, newColPath) } else if (matches.length > 1) { throw new RuntimeException( s"Cannot write to `$tableName`, " + s"due to ambiguous column name `${newColPath.mkString(".")}`.") } else { matchedNames += matches.head.name - val actualExpectedCol = expectedCol.withDataType { - CharVarcharUtils.getRawType(expectedCol.metadata).getOrElse(expectedCol.dataType) - } + val childTarget = + if (behavior == PreserveTarget) { + targetExpr.map(t => GetStructField(t, ordinal, Some(expectedCol.name))) + } else None resolveField( tableName, matches.head, - actualExpectedCol, + 
restoreActualType(expectedCol), byName = true, - conf, - mergeSchemaEnabled, + behavior, + childTarget, newColPath) } } if (matchedNames.size < inputCols.length) { val extras = inputCols.filterNot(col => matchedNames.contains(col.name)) - if (!mergeSchemaEnabled) { + if (behavior == FailMissing) { val extrasStr = extras.map(c => s"`${c.name}`").mkString(", ") val msg = if (isTopLevel) { s"Cannot write to `$tableName`, extra columns: $extrasStr" @@ -114,18 +163,34 @@ object PaimonOutputResolver { } } + private def fillMissing( + tableName: String, + expectedCol: Attribute, + ordinal: Int, + isTopLevel: Boolean, + behavior: MissingFieldBehavior, + targetExpr: Option[Expression], + newColPath: Seq[String]): NamedExpression = behavior match { + // Top-level always NULL-fills (mirrors Spark INSERT FILL). Nested-level strict throws. + case FailMissing if !isTopLevel => + throw new RuntimeException( + s"Cannot write to `$tableName`, nested struct field " + + s"`${newColPath.mkString(".")}` is missing in source. 
" + + s"Enable 'spark.paimon.write.merge-schema' to fill it with NULL.") + case PreserveTarget if targetExpr.isDefined => + applyColumnMetadata( + GetStructField(targetExpr.get, ordinal, Some(expectedCol.name)), + expectedCol) + case _ => + nullFill(expectedCol) + } + private def resolveColumnsByPosition( tableName: String, inputCols: Seq[NamedExpression], expectedCols: Seq[Attribute], - conf: SQLConf, colPath: Seq[String]): Seq[NamedExpression] = { - val actualExpectedCols = expectedCols.map { - attr => - attr.withDataType { - CharVarcharUtils.getRawType(attr.metadata).getOrElse(attr.dataType) - } - } + val actualExpectedCols = expectedCols.map(restoreActualType) if (inputCols.size != actualExpectedCols.size) { val where = if (colPath.isEmpty) { s"`$tableName`" @@ -142,8 +207,8 @@ object PaimonOutputResolver { inputCol, expectedCol, byName = false, - conf, - mergeSchemaEnabled = false, + behavior = FailMissing, + targetExpr = None, colPath :+ expectedCol.name) } } @@ -153,8 +218,8 @@ object PaimonOutputResolver { input: NamedExpression, expected: Attribute, byName: Boolean, - conf: SQLConf, - mergeSchemaEnabled: Boolean, + behavior: MissingFieldBehavior, + targetExpr: Option[Expression], colPath: Seq[String]): NamedExpression = { (input.dataType, expected.dataType) match { case (sourceType: StructType, targetType: StructType) => @@ -165,8 +230,8 @@ object PaimonOutputResolver { expected, targetType, byName, - conf, - mergeSchemaEnabled, + behavior, + targetExpr, colPath) case (sourceType: ArrayType, targetType: ArrayType) => resolveArrayType( @@ -176,8 +241,7 @@ object PaimonOutputResolver { expected, targetType, byName, - conf, - mergeSchemaEnabled, + behavior, colPath) case (sourceType: MapType, targetType: MapType) => resolveMapType( @@ -187,11 +251,10 @@ object PaimonOutputResolver { expected, targetType, byName, - conf, - mergeSchemaEnabled, + behavior, colPath) case _ => - checkField(input, expected, conf, colPath) + checkField(input, expected, colPath) } } 
@@ -202,8 +265,8 @@ object PaimonOutputResolver { expected: Attribute, targetType: StructType, byName: Boolean, - conf: SQLConf, - mergeSchemaEnabled: Boolean, + behavior: MissingFieldBehavior, + targetExpr: Option[Expression], colPath: Seq[String]): NamedExpression = { val nullCheckedInput = checkNullability(input, expected, colPath) val fields = sourceType.zipWithIndex.map { @@ -215,19 +278,67 @@ object PaimonOutputResolver { tableName, fields, toAttributes(targetType), - conf, - mergeSchemaEnabled, + behavior, + targetExpr, colPath) } else { - resolveColumnsByPosition(tableName, fields, toAttributes(targetType), conf, colPath) + resolveColumnsByPosition(tableName, fields, toAttributes(targetType), colPath) } - val struct = CreateStruct(resolved) - val res = if (nullCheckedInput.nullable) { - If(IsNull(nullCheckedInput), Literal(null, struct.dataType), struct) - } else { - struct + val targetNamedStruct = CreateStruct(resolved) + val res = maybeWrapWithNullPreservation( + sourceExpr = nullCheckedInput, + sourceType = sourceType, + targetType = targetType, + targetNamedStructExpr = targetNamedStruct, + originalTargetExprOpt = if (behavior == PreserveTarget) targetExpr else None + ) + applyColumnMetadata(res, expected) + } + + /** + * Collapse `NULL -> struct(NULL, ...)` expansion back to NULL. Under `PreserveTarget` with + * target-only fields, also requires the original target to be NULL so preserved leaves survive. 
+ */ + private def maybeWrapWithNullPreservation( + sourceExpr: Expression, + sourceType: StructType, + targetType: StructType, + targetNamedStructExpr: Expression, + originalTargetExprOpt: Option[Expression]): Expression = { + if (!sourceExpr.nullable) return targetNamedStructExpr + + val sourceNullCondition = IsNull(sourceExpr) + val targetHasExtraFieldsToPreserveValue = + hasExtraStructFieldsToPreserveValue(sourceType, targetType) + val fullNullCondition = originalTargetExprOpt match { + case Some(originalTargetExpr) if targetHasExtraFieldsToPreserveValue => + And(sourceNullCondition, IsNull(originalTargetExpr)) + case Some(_) | None => sourceNullCondition + } + If(fullNullCondition, Literal(null, targetNamedStructExpr.dataType), targetNamedStructExpr) + } + + /** True if `targetStruct` has fields, at any nesting level, missing from `sourceStruct`. */ + private def hasExtraStructFieldsToPreserveValue( + sourceStruct: StructType, + targetStruct: StructType): Boolean = { + if (targetStruct.length > sourceStruct.length) return true + + val (commonFields, targetOnlyFields) = targetStruct.fields.partition { + targetField => sourceStruct.exists(f => conf.resolver(f.name, targetField.name)) + } + if (targetOnlyFields.nonEmpty) return true + + commonFields.exists { + targetField => + sourceStruct.find(f => conf.resolver(f.name, targetField.name)).exists { + sourceField => + (sourceField.dataType, targetField.dataType) match { + case (s: StructType, t: StructType) => hasExtraStructFieldsToPreserveValue(s, t) + case _ => false + } + } } - Alias(res, expected.name)(explicitMetadata = Option(expected.metadata)) } private def resolveArrayType( @@ -237,17 +348,16 @@ object PaimonOutputResolver { expected: Attribute, targetType: ArrayType, byName: Boolean, - conf: SQLConf, - mergeSchemaEnabled: Boolean, + behavior: MissingFieldBehavior, colPath: Seq[String]): NamedExpression = { val nullCheckedInput = checkNullability(input, expected, colPath) val param = 
NamedLambdaVariable("element", sourceType.elementType, sourceType.containsNull) val fakeAttr = AttributeReference("element", targetType.elementType, targetType.containsNull)() val resolved = if (byName) { - reorderColumnsByName(tableName, Seq(param), Seq(fakeAttr), conf, mergeSchemaEnabled, colPath) + reorderColumnsByName(tableName, Seq(param), Seq(fakeAttr), behavior, None, colPath) } else { - resolveColumnsByPosition(tableName, Seq(param), Seq(fakeAttr), conf, colPath) + resolveColumnsByPosition(tableName, Seq(param), Seq(fakeAttr), colPath) } assert(resolved.length == 1) val elementExpr = stripOuterAlias(resolved.head) @@ -256,7 +366,7 @@ object PaimonOutputResolver { } else { ArrayTransform(nullCheckedInput, LambdaFunction(elementExpr, Seq(param))) } - Alias(transformed, expected.name)(explicitMetadata = Option(expected.metadata)) + applyColumnMetadata(transformed, expected) } private def resolveMapType( @@ -266,22 +376,15 @@ object PaimonOutputResolver { expected: Attribute, targetType: MapType, byName: Boolean, - conf: SQLConf, - mergeSchemaEnabled: Boolean, + behavior: MissingFieldBehavior, colPath: Seq[String]): NamedExpression = { val nullCheckedInput = checkNullability(input, expected, colPath) val keyParam = NamedLambdaVariable("key", sourceType.keyType, nullable = false) val fakeKeyAttr = AttributeReference("key", targetType.keyType, nullable = false)() val resolvedKey = if (byName) { - reorderColumnsByName( - tableName, - Seq(keyParam), - Seq(fakeKeyAttr), - conf, - mergeSchemaEnabled, - colPath) + reorderColumnsByName(tableName, Seq(keyParam), Seq(fakeKeyAttr), behavior, None, colPath) } else { - resolveColumnsByPosition(tableName, Seq(keyParam), Seq(fakeKeyAttr), conf, colPath) + resolveColumnsByPosition(tableName, Seq(keyParam), Seq(fakeKeyAttr), colPath) } val valueParam = @@ -289,15 +392,9 @@ object PaimonOutputResolver { val fakeValueAttr = AttributeReference("value", targetType.valueType, targetType.valueContainsNull)() val resolvedValue = if 
(byName) { - reorderColumnsByName( - tableName, - Seq(valueParam), - Seq(fakeValueAttr), - conf, - mergeSchemaEnabled, - colPath) + reorderColumnsByName(tableName, Seq(valueParam), Seq(fakeValueAttr), behavior, None, colPath) } else { - resolveColumnsByPosition(tableName, Seq(valueParam), Seq(fakeValueAttr), conf, colPath) + resolveColumnsByPosition(tableName, Seq(valueParam), Seq(fakeValueAttr), colPath) } assert(resolvedKey.length == 1 && resolvedValue.length == 1) @@ -318,13 +415,12 @@ object PaimonOutputResolver { } MapFromArrays(newKeys, newValues) } - Alias(transformed, expected.name)(explicitMetadata = Option(expected.metadata)) + applyColumnMetadata(transformed, expected) } private def checkField( input: NamedExpression, expected: Attribute, - conf: SQLConf, colPath: Seq[String]): NamedExpression = { val attrTypeHasCharVarchar = CharVarcharUtils.hasCharVarchar(expected.dataType) val attrTypeWithoutCharVarchar = if (attrTypeHasCharVarchar) { @@ -335,7 +431,7 @@ object PaimonOutputResolver { val casted = if (input.dataType == attrTypeWithoutCharVarchar) { input } else { - addCast(input, attrTypeWithoutCharVarchar, conf) + addCast(input, attrTypeWithoutCharVarchar) } val withStrLenCheck = if (conf.charVarcharAsString || !attrTypeHasCharVarchar) { casted @@ -343,7 +439,7 @@ object PaimonOutputResolver { CharVarcharUtils.stringLengthCheck(casted, expected.dataType) } val nullChecked = checkNullability(withStrLenCheck, expected, colPath) - Alias(nullChecked, expected.name)(explicitMetadata = Option(expected.metadata)) + applyColumnMetadata(nullChecked, expected) } private def checkNullability( @@ -357,21 +453,18 @@ object PaimonOutputResolver { } } - private def addCast(expr: Expression, dataType: DataType, conf: SQLConf): Expression = { + private def addCast(expr: Expression, dataType: DataType): Expression = { val cast = Compatibility.cast(expr, dataType, Option(conf.sessionLocalTimeZone)) cast.setTagValue(Compatibility.castByTableInsertionTag, ()) cast } 
private def nullFill(expected: Attribute): NamedExpression = { - Alias(Literal(null, expected.dataType), expected.name)( - explicitMetadata = Option(expected.metadata)) + applyColumnMetadata(Literal(null, expected.dataType), expected) } - private def preserveAsAlias(expr: NamedExpression): NamedExpression = expr match { - case a: Alias => a - case other => - Alias(other, other.name)(explicitMetadata = Option(other.metadata)) + private def preserveAsAlias(expr: NamedExpression): NamedExpression = { + applyColumnMetadata(expr, expr.toAttribute) } private def stripOuterAlias(expr: Expression): Expression = expr match { @@ -382,4 +475,34 @@ object PaimonOutputResolver { private def toAttributes(structType: StructType): Seq[Attribute] = { structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) } + + private def restoreActualType(attr: Attribute): Attribute = { + attr.withDataType(CharVarcharUtils.getRawType(attr.metadata).getOrElse(attr.dataType)) + } + + // Inlined `CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY` — the constant is + // `private[sql]` but stable across Spark 3.2–4.1. + private val CHAR_VARCHAR_TYPE_STRING_KEY = "__CHAR_VARCHAR_TYPE_STRING" + + // Mirrors `CharVarcharUtils.cleanMetadata` (public only in 4.1). Strips the read-side marker + // before metadata reaches a Write-side Alias. + private def cleanMetadata(metadata: Metadata): Metadata = + new MetadataBuilder().withMetadata(metadata).remove(CHAR_VARCHAR_TYPE_STRING_KEY).build() + + // Mirrors `TableOutputResolver.applyColumnMetadata` (SPARK-52772). The explicit-metadata Alias + // prevents later rewrites from inlining the inner AttributeReference and leaking source-side + // metadata (CURRENT_DEFAULT, char/varchar marker) into the Write, which would flip the plan + // back to unresolved. 
+ private def applyColumnMetadata(expr: Expression, expected: Attribute): NamedExpression = { + val required = cleanMetadata(expected.metadata) + expr match { + case v: NamedLambdaVariable if v.name == expected.name && v.metadata == required => v + case _ => + val stripped = expr match { + case a: Alias => a.child + case _ => expr + } + Alias(stripped, expected.name)(explicitMetadata = Some(required)) + } + } } diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonUpdateTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonUpdateTable.scala index a5bf7c713819..e03e658141a6 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonUpdateTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonUpdateTable.scala @@ -18,30 +18,23 @@ package org.apache.paimon.spark.catalyst.analysis +import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper import org.apache.paimon.spark.commands.UpdatePaimonTableCommand import org.apache.paimon.table.FileStoreTable import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral -import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, Assignment, LogicalPlan, UpdateTable} +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, LogicalPlan, UpdateTable} import org.apache.spark.sql.catalyst.rules.Rule import scala.collection.JavaConverters._ -object PaimonUpdateTable - extends Rule[LogicalPlan] - with RowLevelHelper - with AssignmentAlignmentHelper { +object PaimonUpdateTable extends Rule[LogicalPlan] with RowLevelHelper with ExpressionHelper { override val operation: RowLevelOp = Update override def apply(plan: LogicalPlan): LogicalPlan = { - // Spark 4.1 moved RewriteUpdateTable from the "DML rewrite" batch into the main Resolution - // batch, which marks the logical plan 
as analyzed before the Post-Hoc Resolution batch runs. - // `plan.resolveOperators` then short-circuits on the already-analyzed UPDATE node, leaving the - // plan for Spark's physical planner to reject with "Table does not support UPDATE TABLE". Use - // `transformDown` (which unconditionally visits every node) guarded by - // `AnalysisHelper.allowInvokingTransformsInAnalyzer` so the in-analyzer assertion does not - // trip. The pattern guard keeps the rewrite restricted to fully resolved plans. + // Spark 4.1 marks the plan analyzed before postHoc runs, so `resolveOperators` would + // short-circuit. Use `transformDown` under `allowInvokingTransformsInAnalyzer` instead. AnalysisHelper.allowInvokingTransformsInAnalyzer { plan.transformDown { case u @ UpdateTable(PaimonRelation(table), assignments, condition) if u.resolved => @@ -61,12 +54,16 @@ object PaimonUpdateTable "Update operation is not supported when data evolution is enabled yet.") } - val alignedExpressions = - generateAlignedExpressions(relation.output, assignments).zip(relation.output) - - val alignedAssignments = alignedExpressions.map { - case (expression, field) => Assignment(field, expression) - } + // Align against `u.table.output`: for CHAR/VARCHAR columns the analyzer adds a + // `readSidePadding` Project whose output has different exprIds than `relation`, and + // the parsed assignment keys reference the Project's attributes. Order matches + // `relation.output` 1:1, so the subsequent zip stays correct. 
+ val alignedAssignments = PaimonAssignmentUtils.alignUpdateAssignments( + u.table.output, + assignments, + fromStar = false, + mergeSchemaEnabled = false) + val alignedExpressions = alignedAssignments.map(_.value).zip(relation.output) val alignedUpdateTable = u.copy(assignments = alignedAssignments) diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RewriteUpsertTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RewriteUpsertTable.scala index fd8776d6dbb4..ba6d0f8efa7b 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RewriteUpsertTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RewriteUpsertTable.scala @@ -106,8 +106,9 @@ case class RewriteUpsertTable(spark: SparkSession) extends Rule[LogicalPlan] { val assignments: Seq[Assignment] = target.output.zip(source.output).map(a => Assignment(a._1, a._2)) - val mergeActions = Seq(UpdateAction(updateCondiction, assignments)) - val notMatchedActions = Seq(InsertAction(None, assignments)) + val shim = SparkShimLoader.shim + val mergeActions = Seq(shim.createUpdateAction(updateCondiction, assignments)) + val notMatchedActions = Seq(shim.createInsertAction(None, assignments)) SparkShimLoader.shim.createMergeIntoTable( target, diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RowLevelHelper.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RowLevelHelper.scala index 69bf3de0082e..4bbdb8bbd8c2 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RowLevelHelper.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/RowLevelHelper.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.SQLConfHelper import 
org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, BinaryExpression, EqualTo, Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical.{Assignment, MergeIntoTable, UpdateTable} -trait RowLevelHelper extends SQLConfHelper with AssignmentAlignmentHelper { +trait RowLevelHelper extends SQLConfHelper { val operation: RowLevelOp diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/spark/sql/paimon/shims/SparkShim.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/spark/sql/paimon/shims/SparkShim.scala index b8b26237f0b0..7883903b30df 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/spark/sql/paimon/shims/SparkShim.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/spark/sql/paimon/shims/SparkShim.scala @@ -159,16 +159,24 @@ trait SparkShim { notMatchedBySourceActions: Seq[MergeAction], withSchemaEvolution: Boolean): MergeIntoTable + // Spark 3.4 added `notMatchedBySourceActions` to `MergeIntoTable`. On 3.2/3.3 the field doesn't + // exist on the AST, so this returns `Seq.empty`. Lets `paimon-spark-common` (which compiles + // against 3.5/4.1) reference NMBS via a single accessor that works on all minor versions. + def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] + + // Per-version shim: Spark 4.1 added a 3rd `fromStar: Boolean = false` field. A 2-arg call site + // compiled against 4.1 emits an `apply$default$3()` lookup absent on 4.0. Paimon tracks star + // intent via [[PaimonMergeActionTags]], so `fromStar` stays unused here. 
+ def createUpdateAction(condition: Option[Expression], assignments: Seq[Assignment]): UpdateAction + + def createInsertAction(condition: Option[Expression], assignments: Seq[Assignment]): InsertAction + def copyDataSourceV2Relation( relation: DataSourceV2Relation, table: Table, output: Seq[org.apache.spark.sql.catalyst.expressions.AttributeReference]) : DataSourceV2Relation - def copyUpdateAction(action: UpdateAction, assignments: Seq[Assignment]): UpdateAction - - def copyInsertAction(action: InsertAction, assignments: Seq[Assignment]): InsertAction - /** * Returns the list of "early" substitution rules Paimon needs to apply on a parsed view plan. * Spark 3.x exposes both `CTESubstitution` and `SubstituteUnresolvedOrdinals`, but 4.1 removed diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/AssignmentAlignmentHelperTest.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/AssignmentAlignmentHelperTest.scala deleted file mode 100644 index 725e4084570b..000000000000 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/AssignmentAlignmentHelperTest.scala +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.spark.sql - -import org.apache.paimon.spark.PaimonSparkTestBase -import org.apache.paimon.spark.catalyst.analysis.AssignmentAlignmentHelper - -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, InsertAction, MergeIntoTable, UpdateAction} -import org.apache.spark.sql.types.{IntegerType, Metadata, MetadataBuilder, StringType} - -import scala.reflect.ClassTag - -/** - * Test suite for [[AssignmentAlignmentHelper]] methods: - * - alignMergeAction (with isInsert parameter) - */ -class AssignmentAlignmentHelperTest extends PaimonSparkTestBase with AssignmentAlignmentHelper { - - /** Assert assignment key name and value type. */ - private def assertAssignment[T <: Expression: ClassTag]( - assignment: Assignment, - expectedKeyName: String): Unit = { - assert(assignment.key.asInstanceOf[AttributeReference].name == expectedKeyName) - val expectedType = implicitly[ClassTag[T]].runtimeClass - assert( - expectedType.isInstance(assignment.value), - s"Expected value type ${expectedType.getSimpleName}, " + - s"but got ${assignment.value.getClass.getSimpleName}" - ) - } - - /** Assert assignment key name and literal value. 
*/ - private def assertLiteralValue( - assignment: Assignment, - expectedKeyName: String, - expectedValue: Any): Unit = { - assertAssignment[Literal](assignment, expectedKeyName) - assert( - assignment.value.asInstanceOf[Literal].value == expectedValue, - s"Expected literal value $expectedValue, but got ${assignment.value.asInstanceOf[Literal].value}" - ) - } - - test("alignMergeAction: DeleteAction should remain unchanged") { - val condition = Some(Literal(true)) - val deleteAction = DeleteAction(condition) - - val targetOutput = Seq( - AttributeReference("a", IntegerType)(), - AttributeReference("b", IntegerType)(), - AttributeReference("c", StringType)() - ) - - val aligned = alignMergeAction(deleteAction, targetOutput) - - assert(aligned.isInstanceOf[DeleteAction]) - assert(aligned.asInstanceOf[DeleteAction].condition == condition) - } - - test("alignMergeAction: UpdateAction should keep missing columns as-is") { - val targetA = AttributeReference("a", IntegerType)() - val targetB = AttributeReference("b", IntegerType)() - val targetC = AttributeReference("c", StringType)() - val targetOutput = Seq(targetA, targetB, targetC) - - // Only update column 'a', 'b' and 'c' should be kept as is - val assignments = Seq(Assignment(targetA, Literal(100))) - val updateAction = UpdateAction(None, assignments) - - val aligned = alignMergeAction(updateAction, targetOutput) - - assert(aligned.isInstanceOf[UpdateAction]) - val alignedAssignments = aligned.asInstanceOf[UpdateAction].assignments - assert(alignedAssignments.size == 3) - assertAssignment[Literal](alignedAssignments(0), "a") // a = 100 - assertAssignment[AttributeReference](alignedAssignments(1), "b") // b = b (unchanged) - assertAssignment[AttributeReference](alignedAssignments(2), "c") // c = c (unchanged) - } - - test("alignMergeAction: InsertAction should use NULL for missing columns") { - val targetA = AttributeReference("a", IntegerType)() - val targetB = AttributeReference("b", IntegerType)() - val targetC = 
AttributeReference("c", StringType)() - val targetOutput = Seq(targetA, targetB, targetC) - - // Only insert column 'a', 'b' and 'c' should be NULL - val sourceA = AttributeReference("a", IntegerType)() - val assignments = Seq(Assignment(targetA, sourceA)) - val insertAction = InsertAction(None, assignments) - - val aligned = alignMergeAction(insertAction, targetOutput) - - assert(aligned.isInstanceOf[InsertAction]) - val alignedAssignments = aligned.asInstanceOf[InsertAction].assignments - assert(alignedAssignments.size == 3) - assertAssignment[AttributeReference](alignedAssignments(0), "a") // a = source.a - assertLiteralValue(alignedAssignments(1), "b", null) // b = NULL (isInsert mode) - assertLiteralValue(alignedAssignments(2), "c", null) // c = NULL (isInsert mode) - } - - test("alignMergeAction: InsertAction should use default value for missing columns") { - val targetA = AttributeReference("a", IntegerType)() - // Column 'b' has default value 100 - val metadataWithDefault = new MetadataBuilder() - .putString("CURRENT_DEFAULT", "100") - .build() - val targetB = AttributeReference("b", IntegerType, nullable = true, metadataWithDefault)() - val targetC = AttributeReference("c", StringType)() - val targetOutput = Seq(targetA, targetB, targetC) - - // Only insert column 'a', 'b' should use default value 100, 'c' should be NULL - val sourceA = AttributeReference("a", IntegerType)() - val assignments = Seq(Assignment(targetA, sourceA)) - val insertAction = InsertAction(None, assignments) - - val aligned = alignMergeAction(insertAction, targetOutput) - - assert(aligned.isInstanceOf[InsertAction]) - val alignedAssignments = aligned.asInstanceOf[InsertAction].assignments - assert(alignedAssignments.size == 3) - assertAssignment[AttributeReference](alignedAssignments(0), "a") // a = source.a - assertLiteralValue(alignedAssignments(1), "b", 100) // b = 100 (default value) - assertLiteralValue(alignedAssignments(2), "c", null) // c = NULL (no default) - } -} diff --git 
a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoAlignmentTest.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoAlignmentTest.scala new file mode 100644 index 000000000000..6dd00cc7c176 --- /dev/null +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoAlignmentTest.scala @@ -0,0 +1,661 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.Row + +class MergeIntoAlignmentTest extends PaimonSparkTestBase { + + test("basic merge: matched UPDATE *, not-matched INSERT *") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a'), (2, 'b')") + + spark + .sql("SELECT 1 AS id, 'A' AS name UNION ALL SELECT 3 AS id, 'c' AS name") + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET * + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + + checkAnswer( + sql("SELECT id, name FROM t ORDER BY id"), + Seq(Row(1, "A"), Row(2, "b"), Row(3, "c")) + ) + } + } + + // Source top-level extras are silently dropped under star expansion when merge-schema + // is off; `*` expands over target columns. + test( + "merge into: source extra columns silently dropped under star expansion (mergeSchema=false)") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a')") + + spark + .sql("""SELECT 1 AS id, 'A' AS name, 'upd' AS op + | UNION ALL SELECT 2 AS id, 'b' AS name, 'ins' AS op""".stripMargin) + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET * + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + + checkAnswer( + sql("SELECT id, name FROM t ORDER BY id"), + Seq(Row(1, "A"), Row(2, "b")) + ) + } + } + + // With merge-schema enabled, the source-extra column is evolved into the target schema. 
+ test("merge into: source extra columns evolve target schema (mergeSchema=true)") { + withTable("t") { + withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a')") + + spark + .sql("""SELECT 1 AS id, 'A' AS name, 'upd' AS op + | UNION ALL SELECT 2 AS id, 'b' AS name, 'ins' AS op""".stripMargin) + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET * + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + + checkAnswer( + sql("SELECT id, name, op FROM t ORDER BY id"), + Seq(Row(1, "A", "upd"), Row(2, "b", "ins")) + ) + } + } + } + + test("merge into: target has extra column missing in source") { + withTable("news") { + withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + sql("""CREATE TABLE news ( + | newsId STRING, + | newsName STRING, + | tenantIds ARRAY<STRING> + |) USING paimon + | TBLPROPERTIES ('primary-key' = 'newsId', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO news VALUES ('n0', 'old', array('t1'))") + + spark + .sql("""SELECT 'n0' AS newsId, 'updated' AS newsName + | UNION ALL SELECT 'n1' AS newsId, 'foo' AS newsName""".stripMargin) + .createOrReplaceTempView("checkStatus") + + sql("""MERGE INTO news a + | USING checkStatus b ON a.newsId = b.newsId + | WHEN MATCHED THEN UPDATE SET * + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + + checkAnswer( + sql("SELECT newsId, newsName, tenantIds FROM news ORDER BY newsId"), + Seq( + Row("n0", "updated", Seq("t1")), + Row("n1", "foo", null) + ) + ) + } + } + } + + // Strict default: `UPDATE *` with a top-level target column missing in source throws.
+ test("merge into: UPDATE * top-level missing source col throws (mergeSchema=false)") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING, score INT) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a', 10)") + + spark.sql("SELECT 1 AS id, 'A' AS name").createOrReplaceTempView("s") + + val ex = intercept[Exception] { + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET *""".stripMargin) + } + assert(ex.getMessage.toLowerCase.contains("score")) + } + } + + // Strict default: `INSERT *` with a top-level target column missing in source throws. + test("merge into: INSERT * top-level missing source col throws (mergeSchema=false)") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING, score INT) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + + spark.sql("SELECT 2 AS id, 'b' AS name").createOrReplaceTempView("s") + + val ex = intercept[Exception] { + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + } + assert(ex.getMessage.toLowerCase.contains("score")) + } + } + + // Explicit `INSERT (col list)` continues to NULL-fill unmentioned target columns (standard SQL). + test("merge into: explicit INSERT (col list) NULL-fills unmentioned target columns") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING, score INT) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + + spark.sql("SELECT 2 AS id, 'b' AS name").createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN NOT MATCHED THEN INSERT (id, name) VALUES (s.id, s.name)""".stripMargin) + + checkAnswer(sql("SELECT id, name, score FROM t"), Seq(Row(2, "b", null))) + } + } + + // UPDATE * with a nested struct value whose source struct is *narrower* than the target struct + // (source has `a` only; target has `a` and `b`). 
Under `merge-schema=true`, the source-missing + // leaf preserves the target's current value. + test("merge into: UPDATE * nested struct, source missing a leaf preserves target leaf") { + withTable("t") { + withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + sql("""CREATE TABLE t ( + | id INT, + | info STRUCT<a: STRING, b: STRING> + |) USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))") + + spark + .sql("""SELECT 1 AS id, named_struct('a', 'newA') AS info""") + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET *""".stripMargin) + + checkAnswer( + sql("SELECT id, info.a, info.b FROM t"), + Seq(Row(1, "newA", "oldB")) + ) + } + } + } + + // Strict default: source struct narrower than target throws. + test( + "merge into: UPDATE * nested struct, source missing target field throws (mergeSchema=false)") { + withTable("t") { + sql("""CREATE TABLE t ( + | id INT, + | info STRUCT<a: STRING, b: STRING> + |) USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))") + + spark + .sql("""SELECT 1 AS id, named_struct('a', 'newA') AS info""") + .createOrReplaceTempView("s") + + val ex = intercept[Exception] { + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET *""".stripMargin) + } + assert(ex.getMessage.toLowerCase.contains("info") || ex.getMessage.toLowerCase.contains("b")) + } + } + + // Strict default: source struct wider than target throws — silently dropping would lose data.
+  test("merge into: UPDATE * nested struct, source extra field throws (mergeSchema=false)") {
+    withTable("t") {
+      // Target struct has the single field `a`; the source struct adds `b`.
+      sql("""CREATE TABLE t (
+            |  id INT,
+            |  info STRUCT<a: STRING>
+            |) USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+      sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA'))")
+
+      spark
+        .sql("""SELECT 1 AS id, named_struct('a', 'newA', 'b', 'newB') AS info""")
+        .createOrReplaceTempView("s")
+
+      val ex = intercept[Exception] {
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN MATCHED THEN UPDATE SET *""".stripMargin)
+      }
+      // NOTE(review): `contains("b")` matches almost any message; tighten to the qualified
+      // field name once the exact error text is confirmed.
+      val msg = ex.getMessage.toLowerCase
+      assert(msg.contains("info") || msg.contains("b"))
+    }
+  }
+
+  // With schema evolution on, source-wider nested case evolves the target nested struct.
+  test("merge into: UPDATE * nested struct, source extra field evolves schema (mergeSchema=true)") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        sql("""CREATE TABLE t (
+              |  id INT,
+              |  info STRUCT<a: STRING>
+              |) USING paimon
+              | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+        sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA'))")
+
+        spark
+          .sql("""SELECT 1 AS id, named_struct('a', 'newA', 'b', 'newB') AS info""")
+          .createOrReplaceTempView("s")
+
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN MATCHED THEN UPDATE SET *""".stripMargin)
+
+        // `info.b` is readable after the merge, i.e. the nested field was added to the target.
+        checkAnswer(sql("SELECT id, info.a, info.b FROM t"), Seq(Row(1, "newA", "newB")))
+      }
+    }
+  }
+
+  // UPDATE * with a nested struct value that is entirely NULL at source. `fixNullExpansion`
+  // should collapse the constructed `named_struct(null, current_b)` to a NULL struct when the
+  // target struct is also nullable.
+  test("merge into: UPDATE * nested struct, NULL source collapses to NULL struct") {
+    withTable("t") {
+      sql("""CREATE TABLE t (
+            |  id INT,
+            |  info STRUCT<a: STRING, b: STRING>
+            |) USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+      sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))")
+
+      // The whole source struct is NULL (not a struct of NULL fields).
+      spark
+        .sql("""SELECT 1 AS id, CAST(NULL AS STRUCT<a: STRING, b: STRING>) AS info""")
+        .createOrReplaceTempView("s")
+
+      sql("""MERGE INTO t USING s ON t.id = s.id
+            | WHEN MATCHED THEN UPDATE SET *""".stripMargin)
+
+      checkAnswer(
+        sql("SELECT id, info FROM t"),
+        Seq(Row(1, null))
+      )
+    }
+  }
+
+  // mergeSchema=true PreserveTarget path: source NULL must still collapse to NULL when
+  // source/target struct schemas match exactly (no target-only fields to preserve).
+  // Without the hasExtraTargetFields check the per-field expansion produces struct(null, null).
+  test(
+    "merge into: UPDATE * nested struct matching schemas, NULL source collapses (mergeSchema=true)") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        sql("""CREATE TABLE t (
+              |  id INT,
+              |  info STRUCT<a: STRING, b: STRING>
+              |) USING paimon
+              | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+        sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))")
+
+        spark
+          .sql("""SELECT 1 AS id, CAST(NULL AS STRUCT<a: STRING, b: STRING>) AS info""")
+          .createOrReplaceTempView("s")
+
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN MATCHED THEN UPDATE SET *""".stripMargin)
+
+        checkAnswer(
+          sql("SELECT id, info FROM t"),
+          Seq(Row(1, null))
+        )
+      }
+    }
+  }
+
+  // Strict default: INSERT * with a nested struct narrower than the target throws — uniform with
+  // the explicit struct-assignment / `UPDATE *` paths.
+  test("merge into: INSERT * nested struct, source missing a leaf throws (mergeSchema=false)") {
+    withTable("t") {
+      sql("""CREATE TABLE t (
+            |  id INT,
+            |  info STRUCT<a: STRING, b: STRING>
+            |) USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+
+      // Source struct carries `a` only; the target also declares `b`.
+      spark
+        .sql("""SELECT 2 AS id, named_struct('a', 'newA') AS info""")
+        .createOrReplaceTempView("s")
+
+      val ex = intercept[Exception] {
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN NOT MATCHED THEN INSERT *""".stripMargin)
+      }
+      // NOTE(review): `contains("b")` matches almost any message; tighten to the qualified
+      // field name once the exact error text is confirmed.
+      val msg = ex.getMessage.toLowerCase
+      assert(msg.contains("info") || msg.contains("b"))
+    }
+  }
+
+  // INSERT has no existing target row to preserve, so source-missing nested leaf is NULL-filled
+  // when merge-schema is enabled.
+  test("merge into: INSERT * nested struct, source missing a leaf gets NULL (mergeSchema=true)") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        sql("""CREATE TABLE t (
+              |  id INT,
+              |  info STRUCT<a: STRING, b: STRING>
+              |) USING paimon
+              | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+
+        spark
+          .sql("""SELECT 2 AS id, named_struct('a', 'newA') AS info""")
+          .createOrReplaceTempView("s")
+
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN NOT MATCHED THEN INSERT *""".stripMargin)
+
+        checkAnswer(
+          sql("SELECT id, info.a, info.b FROM t"),
+          Seq(Row(2, "newA", null))
+        )
+      }
+    }
+  }
+
+  // UPDATE * with type coercion (source INT for a BIGINT column). Alignment must inject a Cast.
+  test("merge into: UPDATE * with type coercion (INT -> BIGINT)") {
+    withTable("t") {
+      val createTarget =
+        """CREATE TABLE t (id INT, cnt BIGINT)
+          | USING paimon
+          | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin
+      sql(createTarget)
+      sql("INSERT INTO t VALUES (1, 100L)")
+
+      // Source `cnt` is INT while the target column is BIGINT.
+      val source = spark.sql("SELECT 1 AS id, CAST(42 AS INT) AS cnt")
+      source.createOrReplaceTempView("s")
+
+      val mergeStar =
+        """MERGE INTO t USING s ON t.id = s.id
+          | WHEN MATCHED THEN UPDATE SET *""".stripMargin
+      sql(mergeStar)
+
+      // The updated value is read back as a Long.
+      checkAnswer(sql("SELECT id, cnt FROM t"), Row(1, 42L) :: Nil)
+    }
+  }
+
+  // Explicit (non-star) SET: target columns that are not assigned keep the value they had.
+  test("merge into: explicit SET preserves unmentioned target columns") {
+    withTable("t") {
+      val createTarget =
+        """CREATE TABLE t (id INT, name STRING, score INT)
+          | USING paimon
+          | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin
+      sql(createTarget)
+      sql("INSERT INTO t VALUES (1, 'a', 10)")
+
+      val source = spark.sql("SELECT 1 AS id, 'A' AS name")
+      source.createOrReplaceTempView("s")
+
+      val mergeNameOnly =
+        """MERGE INTO t USING s ON t.id = s.id
+          | WHEN MATCHED THEN UPDATE SET t.name = s.name""".stripMargin
+      sql(mergeNameOnly)
+
+      // `score` was never assigned, so it must still be 10.
+      checkAnswer(sql("SELECT id, name, score FROM t"), Row(1, "A", 10) :: Nil)
+    }
+  }
+
+  // Mixed actions: DELETE on one match, UPDATE on another, INSERT for unmatched.
+  test("merge into: mixed DELETE + UPDATE + INSERT actions") {
+    withTable("t") {
+      sql("""CREATE TABLE t (id INT, name STRING)
+            | USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+      sql("INSERT INTO t VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+
+      // `op` selects which MATCHED clause applies to a source row; id 4 has no match.
+      spark
+        .sql("""SELECT 1 AS id, 'A' AS name, 'del' AS op
+               | UNION ALL SELECT 2 AS id, 'B' AS name, 'upd' AS op
+               | UNION ALL SELECT 4 AS id, 'd' AS name, 'ins' AS op""".stripMargin)
+        .createOrReplaceTempView("s")
+
+      sql("""MERGE INTO t USING s ON t.id = s.id
+            | WHEN MATCHED AND s.op = 'del' THEN DELETE
+            | WHEN MATCHED AND s.op = 'upd' THEN UPDATE SET t.name = s.name
+            | WHEN NOT MATCHED THEN INSERT (id, name) VALUES (s.id, s.name)""".stripMargin)
+
+      // Row 1 deleted, row 2 updated, row 3 untouched, row 4 inserted.
+      checkAnswer(
+        sql("SELECT id, name FROM t ORDER BY id"),
+        Seq(Row(2, "B"), Row(3, "c"), Row(4, "d"))
+      )
+    }
+  }
+
+  // 3-level nested struct under UPDATE *. Innermost leaf missing in source preserves target value
+  // at every depth under `merge-schema=true`.
+  test("merge into: UPDATE * 3-level nested struct preserves deep leaves") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        sql("""CREATE TABLE t (
+              |  id INT,
+              |  l1 STRUCT<a: STRING, l2: STRUCT<b: STRING, l3: STRUCT<c: STRING, d: STRING>>>
+              |) USING paimon
+              | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+        sql("""INSERT INTO t VALUES (
+              |  1,
+              |  named_struct(
+              |    'a', 'oldA',
+              |    'l2', named_struct(
+              |      'b', 'oldB',
+              |      'l3', named_struct('c', 'oldC', 'd', 'oldD')))
+              |)""".stripMargin)
+
+        // Source omits `l1.l2.l3.d`; we expect `oldD` to be preserved.
+        spark
+          .sql("""SELECT 1 AS id,
+                 |  named_struct(
+                 |    'a', 'newA',
+                 |    'l2', named_struct(
+                 |      'b', 'newB',
+                 |      'l3', named_struct('c', 'newC'))) AS l1""".stripMargin)
+          .createOrReplaceTempView("s")
+
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN MATCHED THEN UPDATE SET *""".stripMargin)
+
+        checkAnswer(
+          sql("SELECT id, l1.a, l1.l2.b, l1.l2.l3.c, l1.l2.l3.d FROM t"),
+          Seq(Row(1, "newA", "newB", "newC", "oldD"))
+        )
+      }
+    }
+  }
+
+  // Explicit `INSERT (newCol) VALUES (s.newCol)` should evolve the target schema under
+  // `merge-schema=true` — same outcome as `INSERT *`. The resolver fallback marks the key as
+  // source-bound; `MergeSchemaEvolutionHelper` rebinds it onto the evolved target attribute.
+  test("merge into: explicit INSERT (newCol) evolves target schema (mergeSchema=true)") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        sql("""CREATE TABLE t (id INT, name STRING)
+              | USING paimon
+              | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin)
+
+        spark
+          .sql("SELECT 2 AS id, 'b' AS name, 99 AS score")
+          .createOrReplaceTempView("s")
+
+        sql("""MERGE INTO t USING s ON t.id = s.id
+              | WHEN NOT MATCHED THEN INSERT (id, name, score) VALUES (s.id, s.name, s.score)
+              |""".stripMargin)
+
+        checkAnswer(
+          sql("SELECT id, name, score FROM t ORDER BY id"),
+          Seq(Row(2, "b", 99))
+        )
+      }
+    }
+  }
+
+  // Explicit `UPDATE SET newCol = s.x` should evolve the target schema under `merge-schema=true`.
+  test("merge into: explicit UPDATE SET newCol = s.x evolves target schema (mergeSchema=true)") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        val createTarget =
+          """CREATE TABLE t (id INT, name STRING)
+            | USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin
+        sql(createTarget)
+        sql("INSERT INTO t VALUES (1, 'a')")
+
+        // `score` exists only on the source side.
+        val source = spark.sql("SELECT 1 AS id, 'A' AS name, 99 AS score")
+        source.createOrReplaceTempView("s")
+
+        val mergeWithNewColumn =
+          """MERGE INTO t USING s ON t.id = s.id
+            | WHEN MATCHED THEN UPDATE SET t.id = s.id, t.name = s.name, score = s.score
+            |""".stripMargin
+        sql(mergeWithNewColumn)
+
+        // `score` is selectable from the target afterwards and holds the source value.
+        checkAnswer(sql("SELECT id, name, score FROM t"), Row(1, "A", 99) :: Nil)
+      }
+    }
+  }
+
+  // Scoped evolution: an explicit INSERT column list adds only the columns it names
+  // (`new_col1`), even though the source also carries `new_col2`.
+  test("merge into: explicit INSERT (newCol1) scopes evolution to mentioned columns") {
+    withTable("t") {
+      withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") {
+        val createTarget =
+          """CREATE TABLE t (id INT, name STRING)
+            | USING paimon
+            | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin
+        sql(createTarget)
+
+        val source = spark.sql("SELECT 2 AS id, 'b' AS name, 10 AS new_col1, 20 AS new_col2")
+        source.createOrReplaceTempView("s")
+
+        val mergeScoped =
+          """MERGE INTO t USING s ON t.id = s.id
+            | WHEN NOT MATCHED THEN INSERT (id, name, new_col1) VALUES (s.id, s.name, s.new_col1)
+            |""".stripMargin
+        sql(mergeScoped)
+
+        // The target schema must not have picked up the unreferenced `new_col2`.
+        val actualColumns = spark.table("t").schema.fieldNames.toSet
+        assert(
+          actualColumns == Set("id", "name", "new_col1"),
+          s"unexpected columns: $actualColumns")
+        checkAnswer(sql("SELECT id, name, new_col1 FROM t"), Row(2, "b", 10) :: Nil)
+      }
+    }
+  }
+
+  // Strict default: explicit assignment to a non-existent target column still fails — only
+  // merge-schema mode treats it as a new-column intent.
+ test("merge into: explicit INSERT (newCol) without merge-schema fails (mergeSchema=false)") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + + spark + .sql("SELECT 2 AS id, 'b' AS name, 99 AS score") + .createOrReplaceTempView("s") + + val ex = intercept[Exception] { + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN NOT MATCHED THEN INSERT (id, name, score) VALUES (s.id, s.name, s.score) + |""".stripMargin) + } + assert(ex.getMessage.toLowerCase.contains("score")) + } + } + + // Explicit nested-field assignment via `t.s.c1 = ...`. The other struct field must be preserved + // (handled by `applyFieldAssignments` recursing through the target struct). + test("merge into: UPDATE explicit nested-field assignment preserves sibling field") { + withTable("t") { + sql("""CREATE TABLE t ( + | id INT, + | info STRUCT + |) USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))") + + spark.sql("SELECT 1 AS id, 'newA' AS new_a").createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET t.info.a = s.new_a""".stripMargin) + + checkAnswer(sql("SELECT id, info.a, info.b FROM t"), Seq(Row(1, "newA", "oldB"))) + } + } + + test("merge into: top-level column names match case-insensitively (UPDATE * / INSERT *)") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a'), (2, 'b')") + + // Source uses uppercase, target uses lowercase. 
+ spark + .sql("SELECT 1 AS ID, 'A' AS NAME UNION ALL SELECT 3 AS ID, 'c' AS NAME") + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.ID + | WHEN MATCHED THEN UPDATE SET * + | WHEN NOT MATCHED THEN INSERT *""".stripMargin) + + checkAnswer( + sql("SELECT id, name FROM t ORDER BY id"), + Seq(Row(1, "A"), Row(2, "b"), Row(3, "c")) + ) + } + } + + test("merge into: explicit SET target column LHS matches case-insensitively") { + withTable("t") { + sql("""CREATE TABLE t (id INT, name STRING) + | USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, 'old')") + + spark.sql("SELECT 1 AS id, 'NEW' AS new_name").createOrReplaceTempView("s") + + // LHS `NAME` (uppercase) matches target `name`. + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET NAME = s.new_name""".stripMargin) + + checkAnswer(sql("SELECT id, name FROM t"), Seq(Row(1, "NEW"))) + } + } + + test("merge into: nested struct field names match case-insensitively (UPDATE *)") { + withTable("t") { + sql("""CREATE TABLE t ( + | id INT, + | info STRUCT + |) USING paimon + | TBLPROPERTIES ('primary-key' = 'id', 'bucket' = '1')""".stripMargin) + sql("INSERT INTO t VALUES (1, named_struct('a', 'oldA', 'b', 'oldB'))") + + // Source struct fields use uppercase. 
+ spark + .sql("SELECT 1 AS id, named_struct('A', 'newA', 'B', 'newB') AS info") + .createOrReplaceTempView("s") + + sql("""MERGE INTO t USING s ON t.id = s.id + | WHEN MATCHED THEN UPDATE SET *""".stripMargin) + + checkAnswer(sql("SELECT id, info.a, info.b FROM t"), Seq(Row(1, "newA", "newB"))) + } + } +} diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTestBase.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTestBase.scala index b328fa186739..bc0924ac1783 100644 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTestBase.scala +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTestBase.scala @@ -471,15 +471,16 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab } } - test("Paimon MergeInto: fail in case that miss some columns in insert") { - withTable("source", "target") { + test( + "Paimon MergeInto: UPDATE * / INSERT * with missing source column throws (mergeSchema=false)") { + withTable("source", "target") { Seq((1, "c11"), (3, "c33")).toDF("a", "c").createOrReplaceTempView("source") createTable("target", "a INT, b INT, c STRING", Seq("a")) spark.sql("INSERT INTO target values (1, 10, 'c1'), (2, 20, 'c2')") - val error = intercept[RuntimeException] { + val error = intercept[Exception] { spark.sql(s""" |MERGE INTO target |USING source @@ -490,7 +491,55 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab |THEN INSERT * |""".stripMargin) }.getMessage - assert(error.contains("cannot resolve b from Project")) + assert(error.toLowerCase.contains("b")) + } + } + + test("Paimon MergeInto: UPDATE * preserves target column missing in source (mergeSchema=true)") { + withTable("source", "target") { + withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + Seq((1, "c11")).toDF("a", "c").createOrReplaceTempView("source") + + 
createTable("target", "a INT, b INT, c STRING", Seq("a")) + spark.sql("INSERT INTO target values (1, 10, 'c1'), (2, 20, 'c2')") + + spark.sql(s""" + |MERGE INTO target + |USING source + |ON target.a = source.a + |WHEN MATCHED THEN + |UPDATE SET * + |""".stripMargin) + + // Matched row keeps current `b` (source has no `b`); unmatched row untouched. + checkAnswer( + spark.sql("SELECT * FROM target ORDER BY a"), + Row(1, 10, "c11") :: Row(2, 20, "c2") :: Nil) + } + } + } + + test("Paimon MergeInto: INSERT * NULL-fills target column missing in source (mergeSchema=true)") { + withTable("source", "target") { + withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + Seq((3, "c33")).toDF("a", "c").createOrReplaceTempView("source") + + createTable("target", "a INT, b INT, c STRING", Seq("a")) + spark.sql("INSERT INTO target values (1, 10, 'c1'), (2, 20, 'c2')") + + spark.sql(s""" + |MERGE INTO target + |USING source + |ON target.a = source.a + |WHEN NOT MATCHED + |THEN INSERT * + |""".stripMargin) + + // New row gets `b = NULL` (source has no `b`); existing rows untouched. 
+ checkAnswer( + spark.sql("SELECT * FROM target ORDER BY a"), + Row(1, 10, "c1") :: Row(2, 20, "c2") :: Row(3, null, "c33") :: Nil) + } } } @@ -568,7 +617,7 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab |THEN INSERT * |""".stripMargin) }.getMessage - assert(error1.contains("Conflicting update/insert on attrs: b")) + assert(error1.contains("Multiple assignments for 'b'")) val error2 = intercept[RuntimeException] { spark.sql(s""" @@ -581,7 +630,7 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab |THEN INSERT (a, a, c) VALUES (a, b, c) |""".stripMargin) }.getMessage - assert(error2.contains("Conflicting update/insert on attrs: a")) + assert(error2.contains("Multiple assignments for 'a'")) } } @@ -915,7 +964,7 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab } } - test("Paimon MergeInto: merge-schema disabled should not add new columns") { + test("Paimon MergeInto: merge-schema disabled silently drops source-extra columns") { withTable("source", "target") { withSparkSQLConf("spark.paimon.write.merge-schema" -> "false") { createTable("target", "a INT, b STRING", Seq("a")) @@ -1379,32 +1428,36 @@ abstract class MergeIntoTableTestBase extends PaimonSparkTestBase with PaimonTab } test("Paimon MergeInto: struct field reorder when target has fields absent from source") { - withTable("source", "target") { - // Target struct has 3 sub-fields; source struct only has 2 of them in a different order. - createTable("target", "id INT, info STRUCT", Seq("id")) - spark.sql("INSERT INTO target VALUES (1, struct(1, 2, 3)), (2, struct(4, 5, 6))") + // Source struct narrower than target: strict mode rejects the nested missing field; + // merge-schema NULL-fills it (INSERT path has no prior row to preserve). 
+ withSparkSQLConf("spark.paimon.write.merge-schema" -> "true") { + withTable("source", "target") { + // Target struct has 3 sub-fields; source struct only has 2 of them in a different order. + createTable("target", "id INT, info STRUCT", Seq("id")) + spark.sql("INSERT INTO target VALUES (1, struct(1, 2, 3)), (2, struct(4, 5, 6))") - createTable("source", "id INT, info STRUCT", Seq("id")) - spark.sql("INSERT INTO source VALUES (1, struct(20, 10)), (3, struct(80, 70))") + createTable("source", "id INT, info STRUCT", Seq("id")) + spark.sql("INSERT INTO source VALUES (1, struct(20, 10)), (3, struct(80, 70))") - spark.sql(""" - |MERGE INTO target - |USING source - |ON target.id = source.id - |WHEN MATCHED THEN - |UPDATE SET target.info = source.info - |WHEN NOT MATCHED THEN - |INSERT (id, info) VALUES (source.id, source.info) - |""".stripMargin) + spark.sql(""" + |MERGE INTO target + |USING source + |ON target.id = source.id + |WHEN MATCHED THEN + |UPDATE SET target.info = source.info + |WHEN NOT MATCHED THEN + |INSERT (id, info) VALUES (source.id, source.info) + |""".stripMargin) - checkAnswer( - spark.sql("SELECT * FROM target ORDER BY id"), - Seq( - Row(1, Row(10, 20, null)), - Row(2, Row(4, 5, 6)), - Row(3, Row(70, 80, null)) + checkAnswer( + spark.sql("SELECT * FROM target ORDER BY id"), + Seq( + Row(1, Row(10, 20, null)), + Row(2, Row(4, 5, 6)), + Row(3, Row(70, 80, null)) + ) ) - ) + } } } diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTestBase.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTestBase.scala index 072324ce3a98..665a08017a94 100644 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTestBase.scala +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTestBase.scala @@ -353,7 +353,7 @@ abstract class UpdateTableTestBase extends PaimonSparkTestBase { assertThatThrownBy( () => 
spark.sql("UPDATE T SET s.c2 = 'a_new', s = struct(11, 'a_new') WHERE s.c1 = 1")) - .hasMessageContaining("Conflicting update/insert on attrs: s.c2, s") + .hasMessageContaining("Conflicting assignments for 's'") } test("Paimon update: update table with char type") { @@ -363,6 +363,26 @@ abstract class UpdateTableTestBase extends PaimonSparkTestBase { checkAnswer(sql("SELECT * FROM T"), Seq(Row(1, "s", "b"))) } + test("Paimon update: overlong CHAR value throws (same as INSERT)") { + withTable("t_char") { + sql("CREATE TABLE t_char (id INT, c CHAR(2))") + sql("INSERT INTO t_char VALUES (1, 'aa')") + assertThatThrownBy(() => sql("UPDATE t_char SET c = 'abc' WHERE id = 1")) + .hasMessageContaining("char/varchar") + checkAnswer(sql("SELECT * FROM t_char"), Seq(Row(1, "aa"))) + } + } + + test("Paimon update: overlong VARCHAR value throws (same as INSERT)") { + withTable("t_varchar") { + sql("CREATE TABLE t_varchar (id INT, v VARCHAR(2))") + sql("INSERT INTO t_varchar VALUES (1, 'bb')") + assertThatThrownBy(() => sql("UPDATE t_varchar SET v = 'abc' WHERE id = 1")) + .hasMessageContaining("char/varchar") + checkAnswer(sql("SELECT * FROM t_varchar"), Seq(Row(1, "bb"))) + } + } + test("Paimon update: non pk table commit kind") { for (dvEnabled <- Seq(true, false)) { withTable("t") { diff --git a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala index c069c02dc5b7..8877446d7615 100644 --- a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala +++ b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/MinorVersionShim.scala @@ -44,4 +44,7 @@ object MinorVersionShim { notMatchedActions, notMatchedBySourceActions) } + + def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = + merge.notMatchedBySourceActions } diff 
--git a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark3Shim.scala b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark3Shim.scala index 682dc89014a1..cbefee5d0487 100644 --- a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark3Shim.scala +++ b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark3Shim.scala @@ -239,6 +239,19 @@ class Spark3Shim extends SparkShim { notMatchedBySourceActions) } + override def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = + MinorVersionShim.notMatchedBySourceActions(merge) + + override def createUpdateAction( + condition: Option[Expression], + assignments: Seq[Assignment]): UpdateAction = + UpdateAction(condition, assignments) + + override def createInsertAction( + condition: Option[Expression], + assignments: Seq[Assignment]): InsertAction = + InsertAction(condition, assignments) + override def copyDataSourceV2Relation( relation: DataSourceV2Relation, table: Table, @@ -246,18 +259,6 @@ class Spark3Shim extends SparkShim { relation.copy(table = table, output = output) } - override def copyUpdateAction( - action: UpdateAction, - assignments: Seq[Assignment]): UpdateAction = { - action.copy(assignments = assignments) - } - - override def copyInsertAction( - action: InsertAction, - assignments: Seq[Assignment]): InsertAction = { - action.copy(assignments = assignments) - } - override def earlyBatchRules(): Seq[Rule[LogicalPlan]] = Seq(CTESubstitution, SubstituteUnresolvedOrdinals) diff --git a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41MergeIntoRewrite.scala b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41MergeIntoRewrite.scala index 921092572e18..0b2e6c607e17 100644 --- 
a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41MergeIntoRewrite.scala +++ b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41MergeIntoRewrite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.paimon.spark.SparkTable -import org.apache.paimon.spark.catalyst.analysis.{AssignmentAlignmentHelper, MergeSchemaEvolutionHelper, PaimonRelation} +import org.apache.paimon.spark.catalyst.analysis.{MergeSchemaEvolutionHelper, PaimonRelation} import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, Exists, Expression, IsNotNull, Literal, MetadataAttribute, MonotonicallyIncreasingID, OuterReference, PredicateHelper, SubqueryExpression} @@ -38,49 +38,27 @@ import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.util.CaseInsensitiveStringMap /** - * Spark 4.1-only Resolution-batch rule that rewrites MERGE INTO plans targeting pure append-only - * Paimon tables (no PK / RT / DE / DV) into the V2 `ReplaceData` / `AppendData` plans — the same - * forms Spark's built-in `RewriteMergeIntoTable` produces for `SupportsRowLevelOperations` tables - * that don't implement `SupportsDelta`. + * Spark 4.1-only Resolution-batch rule that rewrites MERGE INTO on pure append-only Paimon tables + * (no PK / RT / DE / DV) into V2 `ReplaceData` / `AppendData` plans, mirroring Spark's built-in + * `RewriteMergeIntoTable` for non-`SupportsDelta` row-level tables. * - * Why this rule exists: Spark 4.1 moved `RewriteMergeIntoTable` into the main Resolution batch AND - * implemented its `apply` with `plan resolveOperators { ... }`. `AnalysisHelper.resolveOperators*` - * short-circuits on already-`analyzed` plans, so by the time the rewrite would run the - * `MergeIntoTable` node has transitioned to `analyzed=true` and the rewrite silently skips. 
The - * `MergeIntoTable` then falls through to the physical planner which rejects it with - * `UNSUPPORTED_FEATURE.TABLE_OPERATION`. This mirrors the `Spark41AppendOnlyRowLevelRewrite` - * treatment of UPDATE. + * In Spark 4.1, `RewriteMergeIntoTable` runs in the Resolution batch via `resolveOperators`, which + * short-circuits on `analyzed=true` plans — by the time it would fire, the `MergeIntoTable` is + * already marked analyzed and silently skipped, so the planner rejects it with + * `UNSUPPORTED_FEATURE.TABLE_OPERATION`. We intercept via `transformDown` under + * `allowInvokingTransformsInAnalyzer` and inline the three `ReplaceData`/`AppendData` branches. + * `SupportsDelta` is omitted — Paimon is copy-on-write only. * - * Firing in the Resolution batch via `transformDown` guarded by - * `AnalysisHelper.allowInvokingTransformsInAnalyzer` intercepts the plan before the analyzed flag - * traps Spark's own rule. The body is a near-verbatim transcription of `RewriteMergeIntoTable`'s - * three `ReplaceData`/`AppendData` branches (kept in lockstep with Spark 4.1.1) so the produced - * plans go through Paimon's V2 row-level write path (`PaimonSparkCopyOnWriteOperation` -> - * `PaimonV2WriteBuilder` -> `PaimonBatchWrite`) exactly like Spark would have produced. The - * `SupportsDelta` branch is intentionally omitted — Paimon's `PaimonSparkCopyOnWriteOperation` is - * copy-on-write only. + * We fire before `ResolveAssignments`, so `m.aligned` is `false`. The rule pre-aligns each action + * list via `PaimonAssignmentUtils.alignActions` (shared with the postHoc `PaimonMergeInto` rule). * - * Like the UPDATE rule, we fire before `ResolveAssignments` has run, so `m.aligned` is still - * `false` when we see the plan. The rule mixes in `AssignmentAlignmentHelper` and aligns all - * merge-action assignments itself (same helper the postHoc `PaimonMergeInto` rule uses) before - * dispatching to the branch logic, which depends on `m.aligned == true`. 
- * - * Scope mirrors `Spark41AppendOnlyRowLevelRewrite`: - * - `SPARK_VERSION >= "4.1"` - * - pure append-only `FileStoreTable`s (no PK / RT / DE / DV) - * - no CHAR columns — Spark's `CharVarcharCodegenUtils.readSidePadding` Project races with the - * rewrite and trips CheckAnalysis when intercepting before the padding stabilises; those plans - * fall through to Paimon's postHoc `PaimonMergeInto` V1 fallback instead. - * - * Tables with PK / row-tracking / data-evolution / deletion-vectors still route to the postHoc - * `PaimonMergeInto` V1 command (`MergeIntoPaimonTable` / `MergeIntoPaimonDataEvolutionTable`) via - * `RowLevelHelper.shouldFallbackToV1MergeInto` — the V1 path is feature-complete for those table - * shapes and this rule leaves them alone. + * CHAR columns are excluded — `readSidePadding` races with the rewrite and trips CheckAnalysis; + * those plans fall back to the postHoc `PaimonMergeInto` V1 path, which also owns PK / RT / DE / DV + * tables via `RowLevelHelper.shouldFallbackToV1MergeInto`. */ object Spark41MergeIntoRewrite extends RewriteRowLevelCommand with PredicateHelper - with AssignmentAlignmentHelper with MergeSchemaEvolutionHelper with PureAppendOnlyScope { @@ -94,8 +72,9 @@ object Spark41MergeIntoRewrite case m: MergeIntoTable if m.resolved && m.rewritable && !m.needSchemaEvolution && targetsPureAppendOnly(m.targetTable) => - // Pure append-only tables never reach the postHoc `PaimonMergeIntoBase`, so evolve here. - rewrite(alignMergeIntoTable(evolveSchemaIfPaimon(m))) + // Pure append-only tables skip postHoc `PaimonMergeInto`, so evolve schema here. + val evolved = evolveSchemaIfPaimon(m) + rewrite(alignAllMergeActions(evolved, evolved.targetTable.output)) } } } @@ -109,11 +88,6 @@ object Spark41MergeIntoRewrite .getOrElse(m) } - /* ------------------------------------------------------------------------------------------- * - * Dispatcher mirroring `RewriteMergeIntoTable.apply`'s three `ReplaceData`/`AppendData` - * branches. 
The `SupportsDelta` branch from Spark is intentionally omitted. - * ------------------------------------------------------------------------------------------- */ - private def rewrite(m: MergeIntoTable): LogicalPlan = { val MergeIntoTable( aliasedTable, @@ -151,27 +125,7 @@ object Spark41MergeIntoRewrite } } - /* ------------------------------------------------------------------------------------------- * - * Assignment alignment. Spark 4.1's `RewriteMergeIntoTable` gates on `m.aligned`, which is set - * by `ResolveAssignments` earlier in the Resolution batch. Because we fire before that rule, - * `m.aligned` is still `false`; align manually using the helper `PaimonMergeInto` uses for V1. - * ------------------------------------------------------------------------------------------- */ - - private def alignMergeIntoTable(m: MergeIntoTable): MergeIntoTable = { - val targetOutput = m.targetTable.output - m.copy( - matchedActions = m.matchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedActions = m.notMatchedActions.map(alignMergeAction(_, targetOutput)), - notMatchedBySourceActions = m.notMatchedBySourceActions.map(alignMergeAction(_, targetOutput)) - ) - } - - /* ------------------------------------------------------------------------------------------- * - * Fast-path #1: single NOT MATCHED InsertAction, no MATCHED, no NOT MATCHED BY SOURCE. Rewrite - * as an append over a left-anti join so Spark can skip the row-level merge machinery. - * Transcribed from `RewriteMergeIntoTable.apply` case 1. - * ------------------------------------------------------------------------------------------- */ - + // Fast-path #1: single NOT MATCHED InsertAction. Append over a left-anti join. 
private def buildSingleInsertAppendPlan( r: DataSourceV2Relation, source: LogicalPlan, @@ -190,12 +144,7 @@ object Spark41MergeIntoRewrite AppendData.byPosition(r, project) } - /* ------------------------------------------------------------------------------------------- * - * Fast-path #2: only NOT MATCHED actions (possibly multiple), no MATCHED, no NOT MATCHED BY - * SOURCE. Rewrite as an append over a left-anti join with a `MergeRows` on top to dispatch - * between the multiple InsertActions. Transcribed from `RewriteMergeIntoTable.apply` case 2. - * ------------------------------------------------------------------------------------------- */ - + // Fast-path #2: only NOT MATCHED actions. Append over a left-anti join with `MergeRows`. private def buildNotMatchedOnlyAppendPlan( r: DataSourceV2Relation, source: LogicalPlan, @@ -224,12 +173,8 @@ object Spark41MergeIntoRewrite AppendData.byPosition(r, mergeRows) } - /* ------------------------------------------------------------------------------------------- * - * General path producing a `ReplaceData` plan. Transcribed near-verbatim from - * `RewriteMergeIntoTable.buildReplaceDataPlan` + `buildReplaceDataMergeRowsPlan`. Kept in - * lockstep with Spark 4.1.1. - * ------------------------------------------------------------------------------------------- */ - + // General path producing a `ReplaceData` plan. Mirrors Spark 4.1.1's + // `RewriteMergeIntoTable.buildReplaceDataPlan` + `buildReplaceDataMergeRowsPlan`. private def buildReplaceDataPlan( relation: DataSourceV2Relation, operationTable: RowLevelOperationTable, @@ -276,9 +221,7 @@ object Spark41MergeIntoRewrite metadataAttrs: Seq[Attribute], checkCardinality: Boolean): MergeRows = { - // target records that were read but did not match any MATCHED or NOT MATCHED BY SOURCE - // actions must be copied over and included in the new state of the table as groups are being - // replaced. 
+ // Unmatched target rows must be copied through since groups are being replaced wholesale. val carryoverRowsOutput = Literal(WRITE_WITH_METADATA_OPERATION) +: targetTable.output val keepCarryoverRowsInstruction = Keep(Copy, TrueLiteral, carryoverRowsOutput) @@ -354,7 +297,6 @@ object Spark41MergeIntoRewrite Join(targetTableProj, sourceTableProj, joinType, Some(joinCond), joinHint) } - /** Matches `RewriteMergeIntoTable.shouldCheckCardinality`. */ private def shouldCheckCardinality(matchedActions: Seq[MergeAction]): Boolean = { matchedActions match { case Nil => false @@ -363,10 +305,7 @@ object Spark41MergeIntoRewrite } } - /** - * Converts a MERGE action into an instruction for the group-based (copy-on-write) plan. Matches - * `RewriteMergeIntoTable.toInstruction(action, metadataAttrs)`. - */ + // Mirrors `RewriteMergeIntoTable.toInstruction`. private def toInstruction(action: MergeAction, metadataAttrs: Seq[Attribute]): Instruction = { action match { case UpdateAction(cond, assignments, _) => @@ -391,10 +330,7 @@ object Spark41MergeIntoRewrite } } - /* ------------------------------------------------------------------------------------------- * - * Condition validation. Mirrors `RewriteMergeIntoTable.validateMergeIntoConditions`. - * ------------------------------------------------------------------------------------------- */ - + // Mirrors `RewriteMergeIntoTable.validateMergeIntoConditions`. private def validateMergeIntoConditions(merge: MergeIntoTable): Unit = { checkMergeIntoCondition("SEARCH", merge.mergeCondition) val actions = merge.matchedActions ++ merge.notMatchedActions ++ merge.notMatchedBySourceActions @@ -418,7 +354,5 @@ object Spark41MergeIntoRewrite } } - // `targetsPureAppendOnly` / `hasCharColumn` are provided by `PureAppendOnlyScope` — kept in - // one place so this rule and `Spark41AppendOnlyRowLevelRewrite` can't drift apart on what - // qualifies as a "pure append-only" Paimon target. 
+ // Scope checks live in `PureAppendOnlyScope`, shared with `Spark41AppendOnlyRowLevelRewrite`. } diff --git a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41UpdateTableRewrite.scala b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41UpdateTableRewrite.scala index abf3142d1ca2..d8082b592e08 100644 --- a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41UpdateTableRewrite.scala +++ b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/catalyst/analysis/Spark41UpdateTableRewrite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.paimon.spark.catalyst.analysis.AssignmentAlignmentHelper +import org.apache.paimon.spark.catalyst.analysis.PaimonAssignmentUtils import org.apache.spark.sql.catalyst.expressions.{Alias, EqualNullSafe, Expression, If, Literal, MetadataAttribute, Not, SubqueryExpression} import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral @@ -31,41 +31,26 @@ import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, Extr import org.apache.spark.sql.util.CaseInsensitiveStringMap /** - * Spark 4.1-only Resolution-batch rule that rewrites UPDATE plans targeting pure append-only Paimon - * tables (see [[PureAppendOnlyScope]] for the scope) into the V2 `ReplaceData` plan — the same form - * Spark's built-in `RewriteUpdateTable` produces for `SupportsRowLevelOperations` tables. + * Spark 4.1-only Resolution-batch rule that rewrites UPDATE on pure append-only Paimon tables (see + * [[PureAppendOnlyScope]]) into a V2 `ReplaceData` plan, mirroring Spark's built-in + * `RewriteUpdateTable`. * - * Why this rule exists: Spark 4.1 moved `RewriteUpdateTable` into the main Resolution batch AND - * implemented its `apply` with `plan resolveOperators { ... }`. 
`AnalysisHelper.resolveOperators*` - * short-circuits on already-`analyzed` plans, so by the time the rewrite would run the append-only - * `UpdateTable` node has transitioned to `analyzed=true` and the rewrite silently skips. The - * `UpdateTable` then falls through to the physical planner which rejects it with - * `UNSUPPORTED_FEATURE.TABLE_OPERATION`. + * In Spark 4.1, `RewriteUpdateTable` runs in the Resolution batch via `resolveOperators`, which + * short-circuits on `analyzed=true` plans — by the time it would fire, the `UpdateTable` is already + * marked analyzed and silently skipped, so the planner rejects it with + * `UNSUPPORTED_FEATURE.TABLE_OPERATION`. We intercept via `transformDown` under + * `allowInvokingTransformsInAnalyzer` and inline `buildReplaceDataPlan` / + * `buildReplaceDataWithUnionPlan`. The class sits in `org.apache.spark.sql.catalyst.analysis` to + * reach the package-private `RowLevelOperationTable` / `ReplaceData` types and the protected + * helpers on `RewriteRowLevelCommand`. * - * Firing this rule in the Resolution batch via `transformDown` guarded by - * `AnalysisHelper.allowInvokingTransformsInAnalyzer` intercepts the plan before the analyzed flag - * traps Spark's own rule. The body is a near-verbatim transcription of - * `RewriteUpdateTable.buildReplaceDataPlan` / `buildReplaceDataWithUnionPlan` so the result goes - * through Paimon's V2 row-level write path (`PaimonSparkCopyOnWriteOperation` -> - * `PaimonV2WriteBuilder` -> `PaimonBatchWrite`) exactly like Spark would have produced. The class - * lives in `org.apache.spark.sql.catalyst.analysis` so it can reference the package-private - * `RowLevelOperationTable` / `ReplaceData` types and the protected helpers on - * `RewriteRowLevelCommand`. + * We fire before `ResolveAssignments`, so `u.aligned` is `false`; the rule pre-aligns via + * `PaimonAssignmentUtils.alignUpdateAssignments` before building the plan. 
* - * One subtlety: Spark's `RewriteUpdateTable` guards on `u.aligned`, which is set by its - * `ResolveAssignments` rule running earlier in the Resolution batch. Because we fire before that - * alignment has taken effect, `u.aligned` is always `false` for us. The rule therefore mixes in - * Paimon's `AssignmentAlignmentHelper` and aligns assignments itself before invoking - * `buildReplaceDataPlan`, which expects one assignment per target data column. - * - * Tables with row-tracking / data-evolution / deletion-vectors still route through Spark's V2 path - * (which handles them correctly). Primary-key tables fall under Paimon's existing postHoc rule. - * DELETE is handled by [[Spark41DeleteMetadataRestore]]; MERGE by [[Spark41MergeIntoRewrite]]. + * PK tables go through the postHoc rule; RT / DE / DV tables go through Spark's V2 path. DELETE is + * handled by [[Spark41DeleteMetadataRestore]]; MERGE by [[Spark41MergeIntoRewrite]]. */ -object Spark41UpdateTableRewrite - extends RewriteRowLevelCommand - with AssignmentAlignmentHelper - with PureAppendOnlyScope { +object Spark41UpdateTableRewrite extends RewriteRowLevelCommand with PureAppendOnlyScope { override def apply(plan: LogicalPlan): LogicalPlan = { if (org.apache.spark.SPARK_VERSION < "4.1") return plan @@ -77,13 +62,13 @@ object Spark41UpdateTableRewrite case r @ ExtractV2Table(tbl: SupportsRowLevelOperations) => val table = buildOperationTable(tbl, UPDATE, CaseInsensitiveStringMap.empty()) val updateCond = cond.getOrElse(TrueLiteral) - // Spark's `RewriteUpdateTable` relies on `ResolveAssignments` having aligned - // `u.assignments` to the full target output first, but that rule fires later in - // the Resolution batch than ours, so `u.aligned` is still `false` when we see the - // plan. 
Align manually with Paimon's `AssignmentAlignmentHelper` (same helper - // the postHoc `PaimonUpdateTable` rule uses for its V1 fallback) before building - // the `ReplaceData` plan, which expects one assignment per target data column. - val alignedAssignments = alignAssignments(r.output, assignments) + // `ResolveAssignments` fires later in the batch, so `u.aligned` is still false. + // Pre-align via the same utility the postHoc V1 fallback uses. + val alignedAssignments = PaimonAssignmentUtils.alignUpdateAssignments( + r.output, + assignments, + fromStar = false, + mergeSchemaEnabled = false) if (SubqueryExpression.hasSubquery(updateCond)) { buildReplaceDataWithUnionPlan(r, table, alignedAssignments, updateCond) } else { @@ -96,13 +81,8 @@ object Spark41UpdateTableRewrite } } - /* ------------------------------------------------------------------------------------------- * - * Near-verbatim replicas of `RewriteUpdateTable`'s private `buildReplaceDataPlan` / - * `buildReplaceDataWithUnionPlan` / `buildReplaceDataUpdateProjection`. Kept in lockstep with - * the Spark 4.1.1 implementation so the produced `ReplaceData` shape matches Spark's and reuses - * Paimon's existing V2 write path. - * ------------------------------------------------------------------------------------------- */ - + // Mirrors Spark 4.1.1 `RewriteUpdateTable.{buildReplaceDataPlan, buildReplaceDataWithUnionPlan, + // buildReplaceDataUpdateProjection}`. 
private def buildReplaceDataPlan( relation: DataSourceV2Relation, operationTable: RowLevelOperationTable, diff --git a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala index b87041d3dc31..9c4a4daa6a55 100644 --- a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala +++ b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/spark/sql/paimon/shims/Spark4Shim.scala @@ -240,6 +240,19 @@ class Spark4Shim extends SparkShim { withSchemaEvolution) } + override def notMatchedBySourceActions(merge: MergeIntoTable): Seq[MergeAction] = + merge.notMatchedBySourceActions + + override def createUpdateAction( + condition: Option[Expression], + assignments: Seq[Assignment]): UpdateAction = + UpdateAction(condition, assignments) + + override def createInsertAction( + condition: Option[Expression], + assignments: Seq[Assignment]): InsertAction = + InsertAction(condition, assignments) + override def copyDataSourceV2Relation( relation: DataSourceV2Relation, table: Table, @@ -247,18 +260,6 @@ class Spark4Shim extends SparkShim { relation.copy(table = table, output = output) } - override def copyUpdateAction( - action: UpdateAction, - assignments: Seq[Assignment]): UpdateAction = { - action.copy(assignments = assignments) - } - - override def copyInsertAction( - action: InsertAction, - assignments: Seq[Assignment]): InsertAction = { - action.copy(assignments = assignments) - } - override def earlyBatchRules(): Seq[Rule[LogicalPlan]] = Seq(CTESubstitution) override def mergeRowsKeepCopy(condition: Expression, output: Seq[Expression]): AnyRef =