Commit 791d5ce

[SPARK-56686][SQL][FOLLOWUP] Fail fast on NULL _commit_timestamp in streaming row-level rewrite
Address @zikangh's review on #55637 -- the streaming row-level rewrite should enforce a non-NULL `_commit_timestamp`, mirroring the runtime guard in CdcNetChangesStatefulProcessor.

A NULL `_commit_timestamp` on a streaming read is a connector contract violation that would silently stall the row's group: the downstream streaming Aggregate uses `_commit_timestamp` as both an event-time watermark column and a grouping key, and Spark's eviction predicate is LessThanOrEqual(eventTime, watermark). A NULL group key never satisfies that predicate, so the group sits in state until end of stream, producing no output and no error.

Add a Filter at the top of the streaming row-level rewrite that raises CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP via the same RaiseError pattern used for the multiple-changes-per-row-version guard in the batch path. Also add the new error class to error-conditions.json.

Tests:
- Plan-shape tests: assert the guard Filter is present and sits directly above the streaming relation (so it runs before any downstream operator sees the NULL).
- End-to-end test: feeding a row with a NULL `_commit_timestamp` surfaces CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP at the streaming query level rather than producing no output.
- Existing carry-over / update-detection plan-shape tests updated for the extra guard Filter (1 -> 2 Filters in the carry-over and combined paths; 0 -> 1 in update-detection-only).

Also refreshed the addStreamingRowLevelPostProcessing Scaladoc to add a step 0 (the guard) and a step 7 (the watermark-metadata strip), keeping the per-operator detail aligned with the rewrite's actual shape. Doc-only side effect: scalafmt reflowed the watermark-metadata bullet in DataStreamReader.changes() Scaladoc (no semantic change).
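The silent stall described above comes from SQL three-valued logic: a comparison against NULL is never true, so the eviction predicate never fires for a NULL-keyed group. A toy model (plain Python, not Spark code; `None` stands in for SQL NULL, and the dict stands in for the state store) illustrates why such a group is retained forever:

```python
def can_evict(event_time, watermark):
    """Mirrors LessThanOrEqual(eventTime, watermark): a None (NULL)
    event time never satisfies the predicate."""
    return event_time is not None and event_time <= watermark

def advance_watermark(groups, watermark):
    """Split buffered groups into (emitted keys, retained state)."""
    emitted = [k for k, ts in groups.items() if can_evict(ts, watermark)]
    retained = {k: ts for k, ts in groups.items() if not can_evict(ts, watermark)}
    return emitted, retained
```

Even with the watermark pushed arbitrarily high, the NULL-keyed group survives every micro-batch -- which is exactly why the rewrite now fails fast instead.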
1 parent 7f471f2 commit 791d5ce

5 files changed

Lines changed: 145 additions & 23 deletions

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 5 additions & 0 deletions
@@ -666,6 +666,11 @@
       "The Change Data Capture (CDC) connector violated the `Changelog` contract at runtime."
     ],
     "subClass" : {
+      "NULL_COMMIT_TIMESTAMP" : {
+        "message" : [
+          "Connector emitted a row with a NULL `_commit_timestamp` on a streaming read engaging post-processing. The `Changelog` contract requires `_commit_timestamp` to be non-NULL for streaming reads, since post-processing uses it as event time to advance the watermark."
+        ]
+      },
       "UNEXPECTED_CHANGE_TYPE" : {
         "message" : [
           "Connector emitted a row with a `_change_type` value that is not one of the four supported types (`insert`, `delete`, `update_preimage`, `update_postimage`). The `Changelog` contract requires every emitted row to carry one of these four values."

sql/api/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala

Lines changed: 5 additions & 6 deletions
@@ -145,12 +145,11 @@ abstract class DataStreamReader {
    *   to the max `_commit_timestamp` observed in the previous batch). A stream that reads its
    *   last commit and stops will keep that commit's events in state until a subsequent
    *   (no-data) micro-batch fires.
-   * - The query is constrained to `Append` output mode; `Update` and `Complete` are
-   *   rejected at writer-start time with
-   *   `STREAMING_OUTPUT_MODE.UNSUPPORTED_OPERATION`. The internal watermark metadata is
-   *   stripped from the user-visible `_commit_timestamp` output, so downstream
-   *   user-supplied watermarks on other columns do not interact with it via the global
-   *   multi-watermark policy.
+   * - The query is constrained to `Append` output mode; `Update` and `Complete` are rejected at
+   *   writer-start time with `STREAMING_OUTPUT_MODE.UNSUPPORTED_OPERATION`. The internal
+   *   watermark metadata is stripped from the user-visible `_commit_timestamp` output, so
+   *   downstream user-supplied watermarks on other columns do not interact with it via the
+   *   global multi-watermark policy.
    *
    * @param tableName
    *   a qualified or unqualified name that designates a table.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveChangelogTable.scala

Lines changed: 45 additions & 10 deletions
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2
 import org.apache.spark.sql.connector.catalog.{Changelog, ChangelogInfo}
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.datasources.v2.{ChangelogTable, DataSourceV2Relation}
-import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType}
+import org.apache.spark.sql.types.{BooleanType, IntegerType, MetadataBuilder, StringType}
 import org.apache.spark.unsafe.types.CalendarInterval

 /**
@@ -244,6 +244,7 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
    * the aggregate so no rows are lost.
    * {{{
    *   DataSourceV2Relation
+   *     -> Filter (RaiseError on NULL _commit_timestamp)
    *     -> EventTimeWatermark(_commit_timestamp, 0s)
    *     -> Aggregate
    *          group by (rowId..., _commit_version, _commit_timestamp)
@@ -255,6 +256,7 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
    *     -> Generate(Inline(__spark_cdc_events)) // re-emit one row per buffered input
    *     -> [Project (update relabel)]
    *     -> Project (drop helper columns)
+   *     -> Project (strip internal EventTimeWatermark metadata)
    * }}}
    *
    * ==Runtime walkthrough==
@@ -278,15 +280,16 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
    *
    * ==Per-operator detail==
    *
+   *  0. [[Filter]] guarding against NULL `_commit_timestamp` -- raises
+   *     `CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP` for any row that
+   *     violates the contract. A NULL would never satisfy the downstream Aggregate's
+   *     `eventTime <= watermark` eviction predicate (NULL is silent in MAX, never
+   *     compares less-than-or-equal), so its group would be held in state forever.
+   *     Failing fast surfaces the connector bug instead of producing no output.
    *  1. [[EventTimeWatermark]] on `_commit_timestamp` (zero delay) -- required so the
    *     downstream stateful aggregate can emit groups in append output mode. By CDC
    *     contract every row in a single commit shares `_commit_timestamp`, so taking it
-   *     as event time is safe. Note: this is currently the only analyzer rule that
-   *     auto-injects an [[EventTimeWatermark]] (others resolve user-supplied watermarks).
-   *     The watermark metadata is preserved on the user-visible `_commit_timestamp`
-   *     output (since [[Generate]]'s `generatorOutput` copies attribute metadata), so a
-   *     downstream user-supplied `withWatermark` on a different column will interact
-   *     with this internal watermark under the global multi-watermark policy.
+   *     as event time is safe.
    *  2. [[Aggregate]] keyed by `(rowId..., _commit_version, _commit_timestamp)`. Computes
    *     the same `_del_cnt` / `_ins_cnt` / (`_min_rv` / `_max_rv` / `_rv_cnt`) helpers as
    *     the batch path, plus an `__spark_cdc_events` array-of-struct buffering every
@@ -304,17 +307,29 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
    *  5. [[Project]] (only when update detection is requested) applying the same
    *     `CHANGELOG_CONTRACT_VIOLATION.UNEXPECTED_MULTIPLE_CHANGES_PER_ROW_VERSION`
    *     guard and `_change_type` relabel as the batch path.
-   *  6. Final [[Project]] (via [[removeHelperColumns]]) drops `__spark_cdc_*` helpers so
+   *  6. [[Project]] (via [[removeHelperColumns]]) drops `__spark_cdc_*` helpers so
    *     the output schema matches the connector's declared schema.
+   *  7. Final [[Project]] (via [[stripCommitTimestampWatermarkMetadata]]) clears the
+   *     `EventTimeWatermark.delayKey` from the user-visible `_commit_timestamp`
+   *     attribute so a downstream user-supplied `withWatermark` on a different column
+   *     does not interact with our internal watermark via the global multi-watermark
+   *     policy.
    */
  private def addStreamingRowLevelPostProcessing(
      plan: LogicalPlan,
      cl: Changelog,
      requiresCarryOverRemoval: Boolean,
      requiresUpdateDetection: Boolean): LogicalPlan = {
-    val rawCommitTsAttr = getAttribute(plan, "_commit_timestamp")
+    // Fail fast on a NULL `_commit_timestamp`. The downstream Aggregate uses it as
+    // both an event-time watermark column and a grouping key; a NULL group-key value
+    // would never satisfy the `eventTime <= watermark` eviction predicate, so the
+    // group would silently stall (held in state until end of stream). Mirrors the
+    // runtime check in [[CdcNetChangesStatefulProcessor]] -- fail fast at the
+    // contract violation rather than producing no output.
+    val plan1 = addNullCommitTimestampGuard(plan)
+    val rawCommitTsAttr = getAttribute(plan1, "_commit_timestamp")
    val watermarked = EventTimeWatermark(
-      UUID.randomUUID(), rawCommitTsAttr, new CalendarInterval(0, 0, 0L), plan)
+      UUID.randomUUID(), rawCommitTsAttr, new CalendarInterval(0, 0, 0L), plan1)

    val rowIdExprs = V2ExpressionUtils.resolveRefs[NamedExpression](
      cl.rowId().toSeq, watermarked)
@@ -404,6 +419,26 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
    removeHelperColumns(cleaned)
  }

+  /**
+   * Adds a `Filter` that raises
+   * `CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP` for any input row whose
+   * `_commit_timestamp` is `NULL`. Used as the first step of the streaming row-level
+   * rewrite so a contract-violating connector fails fast instead of silently stalling
+   * the downstream stateful aggregate's group.
+   */
+  private def addNullCommitTimestampGuard(input: LogicalPlan): LogicalPlan = {
+    val commitTsAttr = getAttribute(input, "_commit_timestamp")
+    val raise = RaiseError(
+      Literal("CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP"),
+      CreateMap(Nil),
+      BooleanType)
+    // CaseWhen returns the default branch (true) for non-null timestamps and
+    // evaluates the side-effecting RaiseError for nulls; the row never passes the
+    // filter on a contract violation.
+    val checkExpr = CaseWhen(Seq(IsNull(commitTsAttr) -> raise), Literal(true))
+    Filter(checkExpr, input)
+  }
+
  /**
   * Final boundary for the streaming row-level rewrite: rebuilds the user-visible
   * `_commit_timestamp` attribute with empty watermark-related metadata. Other
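The guard's CaseWhen-over-RaiseError shape can be modeled outside Catalyst. The sketch below (plain Python, not Spark code; the class and function names are illustrative only) captures the semantics the Filter relies on: a non-NULL timestamp takes the default branch and passes, a NULL timestamp hits the error branch and aborts rather than being dropped:

```python
class ContractViolation(RuntimeError):
    """Stands in for the CHANGELOG_CONTRACT_VIOLATION runtime error."""

def guard_predicate(commit_ts):
    # Default CaseWhen branch: non-NULL timestamp, the row passes the Filter.
    if commit_ts is not None:
        return True
    # Error branch: RaiseError aborts the query instead of filtering the row out,
    # so a contract-violating connector fails loudly.
    raise ContractViolation("CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP")

def apply_guard(rows):
    return [r for r in rows if guard_predicate(r)]
```

The key design point mirrored here is that the predicate never evaluates to false: it either admits the row or raises, so no data is silently lost at the guard.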

sql/core/src/test/scala/org/apache/spark/sql/connector/ChangelogEndToEndSuite.scala

Lines changed: 35 additions & 0 deletions
@@ -857,4 +857,39 @@ class ChangelogEndToEndSuite extends SharedSparkSession {
    assert(e.getMessage.contains("Change Data Capture"),
      s"Error should mention CDC: ${e.getMessage}")
  }
+
+  test("streaming row-level rewrite raises on NULL _commit_timestamp") {
+    val id = recreateWithRowVersion()
+    catalog.setChangelogProperties(id, ChangelogProperties(
+      containsCarryoverRows = true,
+      rowIdNames = Seq("id"),
+      rowVersionName = Some("row_commit_version")))
+
+    // Insert a row with NULL _commit_timestamp (last column).
+    val row = InternalRow(
+      1L, UTF8String.fromString("Alice"), 1L,
+      UTF8String.fromString(CHANGE_TYPE_INSERT), 1L, null)
+    catalog.addChangeRows(id, Seq(row))
+
+    val q = spark.readStream
+      .option("startingVersion", "1")
+      .changes(fullTableName)
+      .writeStream
+      .format("memory")
+      .queryName("cdc_stream_null_ts")
+      .outputMode("append")
+      .start()
+    try {
+      val e = intercept[org.apache.spark.sql.streaming.StreamingQueryException] {
+        q.processAllAvailable()
+      }
+      // The CHANGELOG_CONTRACT_VIOLATION runtime error wraps the message; it should
+      // mention NULL_COMMIT_TIMESTAMP somewhere in the chain.
+      assert(e.getMessage.contains("NULL_COMMIT_TIMESTAMP") ||
+        Option(e.getCause).map(_.getMessage).getOrElse("").contains("NULL_COMMIT_TIMESTAMP"),
+        s"Expected NULL_COMMIT_TIMESTAMP in the error chain. Got: ${e.getMessage}")
+    } finally {
+      q.stop()
+    }
+  }
 }

sql/core/src/test/scala/org/apache/spark/sql/connector/ResolveChangelogTableStreamingPostProcessingSuite.scala

Lines changed: 55 additions & 7 deletions
@@ -133,6 +133,15 @@ class ResolveChangelogTableStreamingPostProcessingSuite
      s"Generate must use Inline. Plan:\n$plan")
  }

+  private def assertContainsNullCommitTimestampGuard(plan: LogicalPlan): Unit = {
+    val nullGuards = plan.collect {
+      case f: Filter
+          if f.condition.toString.contains("NULL_COMMIT_TIMESTAMP") => f
+    }
+    assert(nullGuards.size == 1,
+      s"Expected exactly one NULL_COMMIT_TIMESTAMP guard Filter. Plan:\n$plan")
+  }
+
  // ===========================================================================
  // Carry-over removal only
  // ===========================================================================
@@ -154,8 +163,11 @@ class ResolveChangelogTableStreamingPostProcessingSuite
    assert(groupingNames.toSet == Set("id", "_commit_version", "_commit_timestamp"),
      s"Expected grouping by (id, _commit_version, _commit_timestamp); got $groupingNames")

+    // Two Filters: the NULL `_commit_timestamp` guard + the carry-over predicate.
    val filters = analyzed.collect { case f: Filter => f }
-    assert(filters.size == 1, s"Expected one Filter for carry-over removal. Plan:\n$analyzed")
+    assert(filters.size == 2,
+      s"Expected NULL guard + carry-over Filter. Plan:\n$analyzed")
+    assertContainsNullCommitTimestampGuard(analyzed)

    assertInlineGenerate(analyzed)
    assertHelperColumnsRemoved(analyzed)
@@ -177,10 +189,13 @@ class ResolveChangelogTableStreamingPostProcessingSuite
        "deduplicationMode" -> "none").queryExecution.analyzed
    assertWatermarkOnCommitTimestamp(analyzed)

-    // No carry-over Filter when only update detection runs.
+    // No carry-over Filter when only update detection runs -- but the NULL
+    // `_commit_timestamp` guard Filter is always present.
    val filters = analyzed.collect { case f: Filter => f }
-    assert(filters.isEmpty,
-      s"No Filter expected for update-detection-only path. Plan:\n$analyzed")
+    assert(filters.size == 1,
+      s"Only the NULL guard Filter is expected for update-detection-only path. " +
+        s"Plan:\n$analyzed")
+    assertContainsNullCommitTimestampGuard(analyzed)

    assertInlineGenerate(analyzed)

@@ -211,10 +226,11 @@ class ResolveChangelogTableStreamingPostProcessingSuite
    val aggs = analyzed.collect { case a: Aggregate => a }
    assert(aggs.size == 1, s"Should fuse both passes into a single Aggregate. Plan:\n$analyzed")

-    // Filter for carry-over removal AND a Project for relabeling.
+    // Two Filters: NULL guard + carry-over removal.
    val filters = analyzed.collect { case f: Filter => f }
-    assert(filters.size == 1,
-      s"Exactly one Filter expected for combined path. Plan:\n$analyzed")
+    assert(filters.size == 2,
+      s"Expected NULL guard + carry-over Filter for combined path. Plan:\n$analyzed")
+    assertContainsNullCommitTimestampGuard(analyzed)

    assertInlineGenerate(analyzed)
    assertHelperColumnsRemoved(analyzed)
@@ -277,4 +293,36 @@ class ResolveChangelogTableStreamingPostProcessingSuite
    assert(!ts.get.metadata.contains(EventTimeWatermark.delayKey),
      s"Watermark metadata leaked to user-visible `_commit_timestamp`. Plan:\n$analyzed")
  }
+
+  // ===========================================================================
+  // NULL _commit_timestamp guard
+  // ===========================================================================
+
+  test("NULL _commit_timestamp guard Filter is the first operator after the source") {
+    catalog.setChangelogProperties(identifier, ChangelogProperties(
+      containsCarryoverRows = true,
+      rowIdNames = Seq("id"),
+      rowVersionName = Some("row_commit_version")))
+
+    val analyzed = streamingDf().queryExecution.analyzed
+    // The guard must sit BELOW the EventTimeWatermark (we don't want a NULL row to
+    // be considered for watermark advancement at all). Verify by walking the plan
+    // top-down and finding the guard before any Aggregate.
+    val guards = analyzed.collect {
+      case f: Filter if f.condition.toString.contains("NULL_COMMIT_TIMESTAMP") => f
+    }
+    assert(guards.size == 1, s"Expected exactly one guard. Plan:\n$analyzed")
+    val guard = guards.head
+    val guardChild = guard.child
+    // The guard's child should be the bare relation (or a SubqueryAlias wrapping it),
+    // not the EventTimeWatermark.
+    val isSourceBelowGuard = guardChild match {
+      case _: org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 => true
+      case org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias(_,
+          _: org.apache.spark.sql.catalyst.streaming.StreamingRelationV2) => true
+      case _ => false
+    }
+    assert(isSourceBelowGuard,
+      s"NULL guard Filter should sit directly above the streaming relation. Plan:\n$analyzed")
+  }
 }
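The plan-shape check in the suite above boils down to a small tree-walking pattern: collect all guard nodes, then assert there is exactly one and that its child is the source. A minimal model of that pattern (plain Python, not Catalyst; node kinds and helper names are invented for illustration):

```python
from dataclasses import dataclass, field

@dataclass
class Node:
    kind: str                      # e.g. "source", "guard_filter", "watermark"
    children: list = field(default_factory=list)

def collect(node, kind):
    """Depth-first collect of all nodes of the given kind (like plan.collect)."""
    found = [node] if node.kind == kind else []
    for child in node.children:
        found += collect(child, kind)
    return found

def guard_directly_above_source(root):
    guards = collect(root, "guard_filter")
    return (len(guards) == 1
            and len(guards[0].children) == 1
            and guards[0].children[0].kind == "source")
```

A plan shaped `watermark -> guard_filter -> source` passes the check; one shaped `guard_filter -> watermark -> source` fails it, mirroring the suite's requirement that the guard sit below the EventTimeWatermark, directly above the relation.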
