apache · AnishMahto · May 12, 2026 · May 12, 2026 · May 13, 2026 · May 13, 2026
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -191,6 +191,24 @@
     ],
     "sqlState" : "0A000"
   },
+  "AUTOCDC_COLUMNS_NOT_FOUND_IN_SCHEMA" : {
+    "message" : [
+      "Using <caseSensitivity> column name comparison, the following columns are not present in the <schemaName> schema: <missingColumns>. Available columns: <availableColumns>."
+    ],
+    "sqlState" : "42703"
+  },
+  "AUTOCDC_MULTIPART_COLUMN_IDENTIFIER" : {
+    "message" : [
+      "Expected a single column identifier; got the multi-part identifier <columnName> (parts: <nameParts>)."
+    ],
+    "sqlState" : "42703"
+  },
+  "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT" : {
+    "message" : [
+      "Using <caseSensitivity> column name comparison, the column `<columnName>` in the <schemaName> schema conflicts with the reserved AutoCDC column name `<reservedColumnName>`. Rename or remove the column."
+    ],
+    "sqlState" : "42710"
+  },
   "AVRO_CANNOT_WRITE_NULL_FIELD" : {
     "message" : [
       "Cannot write null value for field <name> defined as non-null Avro data type <dataType>.",

diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.autocdc
+
+import org.apache.spark.sql.{AnalysisException, Column}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
+import org.apache.spark.sql.catalyst.util.QuotingUtils
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Name of exactly one column: no table/alias qualifier and no nested-field path. [[name]] holds
+ * the text with backticks consumed ("`a.b`" is stored as "a.b"), for comparing to field names.
+ */
+case class UnqualifiedColumnName private (name: String) {
+  /** Backtick-quoted form of [[name]], for APIs that re-parse identifier strings. */
+  def quoted: String = QuotingUtils.quoteIdentifier(name)
+}
+
+object UnqualifiedColumnName {
+  /**
+   * Parses `input` as a SQL identifier and wraps it when it has exactly one part.
+   *
+   * @throws AnalysisException with AUTOCDC_MULTIPART_COLUMN_IDENTIFIER if `input` parses to any
+   *                           other number of parts.
+   */
+  def apply(input: String): UnqualifiedColumnName =
+    CatalystSqlParser.parseMultipartIdentifier(input) match {
+      case Seq(singlePart) => new UnqualifiedColumnName(singlePart)
+      case parts => throw multipartError(input, parts)
+    }
+
+  /** Builds the error raised for an identifier that is not a single name part. */
+  private def multipartError(rawInput: String, parts: Seq[String]): AnalysisException =
+    new AnalysisException(
+      errorClass = "AUTOCDC_MULTIPART_COLUMN_IDENTIFIER",
+      messageParameters =
+        Map("columnName" -> rawInput, "nameParts" -> parts.mkString(", "))
+    )
+}
+
+sealed trait ColumnSelection
+object ColumnSelection {
+
+  case class IncludeColumns(columns: Seq[UnqualifiedColumnName]) extends ColumnSelection
+  case class ExcludeColumns(columns: Seq[UnqualifiedColumnName])
+      extends ColumnSelection
+
+  /**
+   * Filters `schema` down to the fields chosen by `columnSelection`. Surviving fields keep the
+   * relative order they had in `schema`.
+   *
+   * @param schemaName      Human-readable label for `schema` (e.g. "microbatch", "target"); only
+   *                        used in the error raised when a selected column cannot be found.
+   * @param schema          Schema whose fields are filtered.
+   * @param columnSelection Columns to keep or to drop. With `None`, `schema` itself is returned.
+   * @param caseSensitive   Match column names case-sensitively when true. Callers are expected
+   *                        to pass the session's `spark.sql.caseSensitive` setting, e.g.
+   *                        `session.sessionState.conf.caseSensitiveAnalysis`.
+   * @throws AnalysisException with AUTOCDC_COLUMNS_NOT_FOUND_IN_SCHEMA if any selected column
+   *                           does not resolve to a field of `schema`.
+   */
+  def applyToSchema(
+      schemaName: String,
+      schema: StructType,
+      columnSelection: Option[ColumnSelection],
+      caseSensitive: Boolean): StructType =
+    columnSelection.fold(schema) { selection =>
+      // `keepOnMatch` says whether a field named by the selection is retained or removed.
+      val (selectedCols, keepOnMatch) = selection match {
+        case IncludeColumns(included) => (included, true)
+        case ExcludeColumns(excluded) => (excluded, false)
+      }
+      val matched = lookupFieldIndices(schemaName, schema, selectedCols, caseSensitive)
+      val retained = schema.fields.zipWithIndex
+        .filter { case (_, idx) => matched.contains(idx) == keepOnMatch }
+        .map { case (field, _) => field }
+      StructType(retained)
+    }
+
+  /** Resolves `fields` to their positions in `schema`, failing if any of them is absent. */
+  private def lookupFieldIndices(
+      schemaName: String,
+      schema: StructType,
+      fields: Seq[UnqualifiedColumnName],
+      caseSensitive: Boolean): Set[Int] = {
+    def indexOf(columnName: String): Option[Int] =
+      if (caseSensitive) schema.getFieldIndex(columnName)
+      else schema.getFieldIndexCaseInsensitive(columnName)
+
+    val lookups = fields.map(f => f.name -> indexOf(f.name))
+    val (found, missing) = lookups.partition { case (_, idx) => idx.isDefined }
+    if (missing.nonEmpty) {
+      throw new AnalysisException(
+        errorClass = "AUTOCDC_COLUMNS_NOT_FOUND_IN_SCHEMA",
+        messageParameters = Map(
+          "caseSensitivity" -> CaseSensitivityLabels.of(caseSensitive),
+          "schemaName" -> schemaName,
+          "missingColumns" -> missing.map(_._1).distinct.mkString(", "),
+          "availableColumns" -> schema.fieldNames.mkString(", ")
+        )
+      )
+    }
+    found.flatMap { case (_, idx) => idx }.toSet
+  }
+}
+
+/** Wording used in AutoCDC error messages to say how column names were compared. */
+private[autocdc] object CaseSensitivityLabels {
+  val CaseSensitive: String = "case-sensitive"
+  val CaseInsensitive: String = "case-insensitive"
+
+  /** Label for the given comparison mode. */
+  def of(caseSensitive: Boolean): String = if (!caseSensitive) CaseInsensitive else CaseSensitive
+}
+
+/** Which Slowly Changing Dimension (SCD) strategy a CDC flow uses to store changes. */
+sealed trait ScdType
+
+object ScdType {
+  /** The standard SCD Type 1 strategy. */
+  case object Type1 extends ScdType
+  /** The standard SCD Type 2 strategy. */
+  case object Type2 extends ScdType
+}
+
+/**
+ * User-supplied settings that describe one AutoCDC flow.
+ *
+ * @param keys            Column(s) whose values together uniquely identify a source row.
+ * @param sequencing      Expression used to order CDC events so that out-of-order arrivals
+ *                        resolve correctly. Its type must be sortable.
+ * @param storedAsScdType SCD strategy that these settings are applied with.
+ * @param deleteCondition Expression flagging a source row as a DELETE. With None, every row
+ *                        is treated as an upsert.
+ * @param columnSelection Source columns to carry into the target table. With None, all
+ *                        columns are selected.
+ */
+case class ChangeArgs(
+    keys: Seq[UnqualifiedColumnName],
+    sequencing: Column,
+    storedAsScdType: ScdType,
+    deleteCondition: Option[Column] = None,
+    columnSelection: Option[ColumnSelection] = None
+)
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.autocdc
+
+import org.apache.spark.SparkException
+import org.apache.spark.sql.{functions => F, AnalysisException}
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.util.QuotingUtils
+import org.apache.spark.sql.classic.DataFrame
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.util.ArrayImplicits._
+
+/**
+ * Processes individual microbatches of an SCD Type 1 AutoCDC flow according to the supplied
+ * [[changeArgs]] configuration.
+ *
+ * @param changeArgs             Configuration of the CDC flow.
+ * @param resolvedSequencingType [[DataType]] of the sequencing column after analysis, taken from
+ *                               the flow's resolved DataFrame when the flow is set up.
+ */
+case class Scd1BatchProcessor(
+    changeArgs: ChangeArgs,
+    resolvedSequencingType: DataType) {
+
+  /**
+   * Collapses the CDC microbatch to one row per key: the row with the greatest
+   * [[ChangeArgs.sequencing]] value for that key.
+   *
+   * SCD1 only needs the latest event per key. If several events for a key tie on the sequence
+   * value, which of them is returned is non-deterministic and undefined.
+   *
+   * @param validatedMicrobatch Microbatch already validated so that the sequencing column holds
+   *                            no nulls and has an orderable data type.
+   * @return One row per key, with the same column names, order and data types as the input.
+   *         NOTE(review): nullability may be widened by the `max_by` round trip; confirm.
+   */
+  def deduplicateMicrobatch(validatedMicrobatch: DataFrame): DataFrame = {
+    // `max_by` yields a single column, so every row is packed into one struct for the
+    // aggregation and expanded back into top-level columns afterwards.
+    val rowFields = validatedMicrobatch.columns
+      .map(name => F.col(QuotingUtils.quoteIdentifier(name)))
+      .toImmutableArraySeq
+
+    val groupingKeys = changeArgs.keys.map(key => F.col(key.quoted))
+    val winnerName = Scd1BatchProcessor.winningRowColName
+    // For each key, the packed row that carries the greatest sequencing value.
+    val latestRowPerKey = F.max_by(F.struct(rowFields: _*), changeArgs.sequencing)
+
+    validatedMicrobatch
+      .groupBy(groupingKeys: _*)
+      .agg(latestRowPerKey.as(winnerName))
+      .select(F.col(s"$winnerName.*"))
+  }
+
+  /**
+   * Adds the CDC metadata column to every row of the microbatch.
+   *
+   * Call this before applying any column selection: [[ChangeArgs.deleteCondition]] and
+   * [[ChangeArgs.sequencing]] are evaluated against the schema of `microbatchDf`, and a column
+   * selection could remove columns those expressions read.
+   *
+   * A row counts as a delete only if [[ChangeArgs.deleteCondition]] evaluates to true; when it is
+   * false or null, or when no condition is configured, the row counts as an upsert.
+   *
+   * @return A dataframe with every column of `microbatchDf` plus the CDC metadata column.
+   * @throws AnalysisException with AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT if `microbatchDf`
+   *                           already has a column matching the reserved metadata column name.
+   */
+  def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame = {
+    // Fail fast if the input already uses the reserved CDC metadata column name.
+    validateCdcMetadataColumnNotPresent(microbatchDf)
+
+    val noSequence = F.lit(null)
+    // Holds the sequencing value for delete rows and null for every other row.
+    val deleteSequence: Column = changeArgs.deleteCondition
+      .map(isDelete => F.when(isDelete, changeArgs.sequencing).otherwise(noSequence))
+      .getOrElse(noSequence)
+
+    // Deletes and upserts are mutually exclusive and together cover all CDC event types, so any
+    // row without a delete sequence is an upsert.
+    val upsertSequence: Column =
+      F.when(deleteSequence.isNull, changeArgs.sequencing).otherwise(noSequence)
+
+    val metadataCol = Scd1BatchProcessor.constructCdcMetadataCol(
+      deleteSequence = deleteSequence,
+      upsertSequence = upsertSequence,
+      sequencingType = resolvedSequencingType
+    )
+    microbatchDf.withColumn(Scd1BatchProcessor.cdcMetadataColName, metadataCol)
+  }
+
+  /**
+   * Throws an AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT [[AnalysisException]] if `microbatchDf`
+   * already has a column whose name matches the reserved CDC metadata column name.
+   */
+  private def validateCdcMetadataColumnNotPresent(microbatchDf: DataFrame): Unit = {
+    val sessionConf = microbatchDf.sparkSession.sessionState.conf
+    val namesMatch = sessionConf.resolver
+    val reservedName = Scd1BatchProcessor.cdcMetadataColName
+
+    // Names are compared with the session's resolver rather than plain string equality.
+    microbatchDf.schema.fieldNames.find(namesMatch(_, reservedName)) match {
+      case Some(conflictingColumnName) =>
+        throw new AnalysisException(
+          errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT",
+          messageParameters = Map(
+            "caseSensitivity" -> CaseSensitivityLabels.of(sessionConf.caseSensitiveAnalysis),
+            "columnName" -> conflictingColumnName,
+            "schemaName" -> "microbatch",
+            "reservedColumnName" -> reservedName
+          )
+        )
+      case None => // No conflict, nothing to do.
+    }
+  }
+}
+
+object Scd1BatchProcessor {
+  // The `__spark_autocdc_` column-name prefix is reserved for internal SDP AutoCDC processing.
+  private[autocdc] val winningRowColName: String = "__spark_autocdc_winning_row"
+  private[autocdc] val cdcMetadataColName: String = "__spark_autocdc_metadata"
+
+  private[autocdc] val cdcDeleteSequenceFieldName: String = "deleteSequence"
+  private[autocdc] val cdcUpsertSequenceFieldName: String = "upsertSequence"
+
+  /**
+   * Layout of the SCD1 CDC metadata struct: a delete-sequence field followed by an
+   * upsert-sequence field, both nullable and both of type `sequencingType`.
+   */
+  private def cdcMetadataColSchema(sequencingType: DataType): StructType = {
+    // A field holds the event's sequencing value when the event is of that kind, else null.
+    def sequenceField(name: String): StructField =
+      StructField(name, sequencingType, nullable = true)
+
+    StructType(Seq(cdcDeleteSequenceFieldName, cdcUpsertSequenceFieldName).map(sequenceField))
+  }
+
+  /**
+   * Builds the SCD1 CDC metadata struct; field order and types follow [[cdcMetadataColSchema]].
+   */
+  private[autocdc] def constructCdcMetadataCol(
+      deleteSequence: Column,
+      upsertSequence: Column,
+      sequencingType: DataType): Column = {
+    def valueFor(fieldName: String): Column =
+      if (fieldName == cdcDeleteSequenceFieldName) {
+        deleteSequence
+      } else if (fieldName == cdcUpsertSequenceFieldName) {
+        upsertSequence
+      } else {
+        throw SparkException.internalError(
+          s"Unable to construct SCD1 CDC metadata column due to unknown `${fieldName}` field."
+        )
+      }
+
+    val structFields = cdcMetadataColSchema(sequencingType).fields
+      .map(field => valueFor(field.name).cast(field.dataType).as(field.name))
+    F.struct(structFields.toImmutableArraySeq: _*)
+  }
+}