-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Kafka Connect: Add bounded retry for transient commit exceptions #16434
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,6 +53,7 @@ | |
| import org.apache.iceberg.connect.events.Event; | ||
| import org.apache.iceberg.connect.events.StartCommit; | ||
| import org.apache.iceberg.connect.events.TableReference; | ||
| import org.apache.iceberg.exceptions.CommitFailedException; | ||
| import org.apache.iceberg.exceptions.NoSuchTableException; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Streams; | ||
|
|
@@ -82,6 +83,7 @@ class Coordinator extends Channel { | |
| private final CommitState commitState; | ||
| private volatile boolean terminated; | ||
| private final String taskId; | ||
| private int consecutiveCommitFailures; | ||
|
|
||
| Coordinator( | ||
| Catalog catalog, | ||
|
|
@@ -150,16 +152,33 @@ protected boolean receive(Envelope envelope) { | |
| private void commit(boolean partialCommit) { | ||
| try { | ||
| doCommit(partialCommit); | ||
| } catch (RuntimeException e) { | ||
| if (!partialCommit) { | ||
| consecutiveCommitFailures = 0; | ||
| } | ||
| } catch (CommitFailedException e) { | ||
| if (partialCommit) { | ||
| LOG.warn( | ||
| "Partial commit {} failed for task {}, will retry", | ||
| commitState.currentCommitId(), | ||
| taskId, | ||
| e); | ||
| } else { | ||
| LOG.error("Commit {} failed for task {}", commitState.currentCommitId(), taskId, e); | ||
| throw e; | ||
| consecutiveCommitFailures++; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd narrow this catch.
Permanent failures like I'd either catch } catch (RuntimeException e) {
if (e instanceof CommitStateUnknownException) {
throw e;
}
// ... existing partialCommit / retry logic, ideally gated on (e instanceof CommitFailedException)
}wdyt? |
||
| if (consecutiveCommitFailures >= config.commitMaxConsecutiveFailures()) { | ||
| LOG.error( | ||
| "Commit {} failed for task {} ({} consecutive failures, terminating)", | ||
| commitState.currentCommitId(), | ||
| taskId, | ||
| consecutiveCommitFailures, | ||
| e); | ||
| throw e; | ||
| } | ||
| LOG.warn( | ||
| "Commit {} failed for task {} ({} consecutive failure(s), will retry)", | ||
| commitState.currentCommitId(), | ||
| taskId, | ||
| consecutiveCommitFailures, | ||
| e); | ||
| } | ||
| } finally { | ||
| commitState.endCurrentCommit(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -130,6 +130,8 @@ public void testCommitNoFiles() { | |
|
|
||
| @Test | ||
| public void testCommitError() { | ||
| when(config.commitMaxConsecutiveFailures()).thenReturn(1); | ||
|
|
||
| // this spec isn't registered with the table | ||
| PartitionSpec badPartitionSpec = | ||
| PartitionSpec.builderFor(SCHEMA).withSpecId(1).identity("id").build(); | ||
|
|
@@ -152,6 +154,8 @@ public void testCommitError() { | |
|
|
||
| @Test | ||
| public void testCommitFailedExceptionPropagates() { | ||
| when(config.commitMaxConsecutiveFailures()).thenReturn(1); | ||
|
|
||
| Table spiedTable = spy(table); | ||
| AppendFiles spiedAppend = spy(table.newAppend()); | ||
| doThrow(new CommitFailedException("Glue detected concurrent update")) | ||
|
|
@@ -170,6 +174,103 @@ public void testCommitFailedExceptionPropagates() { | |
| .hasMessageContaining("Glue detected concurrent update"); | ||
| } | ||
|
|
||
| @Test | ||
| public void testCommitBoundedRetry() { | ||
| when(config.commitMaxConsecutiveFailures()).thenReturn(3); | ||
| when(config.commitIntervalMs()).thenReturn(0); | ||
| when(config.commitTimeoutMs()).thenReturn(Integer.MAX_VALUE); | ||
|
|
||
| Table spiedTable = spy(table); | ||
| AppendFiles spiedAppend = spy(table.newAppend()); | ||
| doThrow(new CommitFailedException("transient error")).when(spiedAppend).commit(); | ||
| when(spiedTable.newAppend()).thenReturn(spiedAppend); | ||
| when(catalog.loadTable(TABLE_IDENTIFIER)).thenReturn(spiedTable); | ||
|
|
||
| SinkTaskContext context = mock(SinkTaskContext.class); | ||
| Coordinator coordinator = | ||
| new Coordinator(catalog, config, ImmutableList.of(), clientFactory, context); | ||
| coordinator.start(); | ||
| initConsumer(); | ||
|
|
||
| // first two failures should not throw | ||
| triggerCommitCycle(coordinator); | ||
| triggerCommitCycle(coordinator); | ||
|
|
||
| // third consecutive failure should terminate | ||
| assertThatThrownBy(() -> triggerCommitCycle(coordinator)) | ||
| .isInstanceOf(CommitFailedException.class) | ||
| .hasMessageContaining("transient error"); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Separately — once the classifier above lands, a one-liner asserting |
||
| } | ||
|
|
||
| @Test | ||
| public void testCommitCounterResetsOnSuccess() { | ||
| when(config.commitMaxConsecutiveFailures()).thenReturn(3); | ||
| when(config.commitIntervalMs()).thenReturn(0); | ||
| when(config.commitTimeoutMs()).thenReturn(Integer.MAX_VALUE); | ||
|
|
||
| Table spiedTable = spy(table); | ||
| AppendFiles failingAppend = spy(table.newAppend()); | ||
| doThrow(new CommitFailedException("transient error")).when(failingAppend).commit(); | ||
|
|
||
| // fail, fail, succeed, fail, fail — should NOT terminate | ||
| when(spiedTable.newAppend()) | ||
| .thenReturn(failingAppend) | ||
| .thenReturn(failingAppend) | ||
| .thenCallRealMethod() | ||
| .thenReturn(failingAppend) | ||
| .thenReturn(failingAppend); | ||
| when(catalog.loadTable(TABLE_IDENTIFIER)).thenReturn(spiedTable); | ||
|
|
||
| SinkTaskContext context = mock(SinkTaskContext.class); | ||
| Coordinator coordinator = | ||
| new Coordinator(catalog, config, ImmutableList.of(), clientFactory, context); | ||
| coordinator.start(); | ||
| initConsumer(); | ||
|
|
||
| // two failures | ||
| triggerCommitCycle(coordinator); | ||
| triggerCommitCycle(coordinator); | ||
|
|
||
| // success resets counter | ||
| triggerCommitCycle(coordinator); | ||
|
|
||
| // two more failures — still under threshold | ||
| triggerCommitCycle(coordinator); | ||
| triggerCommitCycle(coordinator); | ||
| } | ||
|
|
||
| private long nextOffset = 1; | ||
|
|
||
| private void triggerCommitCycle(Coordinator coordinator) { | ||
| coordinator.process(); | ||
|
|
||
| byte[] startBytes = producer.history().get(producer.history().size() - 1).value(); | ||
| Event startEvent = AvroUtil.decode(startBytes); | ||
| UUID commitId = ((StartCommit) startEvent.payload()).commitId(); | ||
|
|
||
| Event commitResponse = | ||
| new Event( | ||
| config.connectGroupId(), | ||
| new DataWritten( | ||
| StructType.of(), | ||
| commitId, | ||
| TableReference.of("catalog", TableIdentifier.of("db", "tbl"), null), | ||
| ImmutableList.of(EventTestUtil.createDataFile()), | ||
| ImmutableList.of())); | ||
| byte[] bytes = AvroUtil.encode(commitResponse); | ||
| consumer.addRecord(new ConsumerRecord<>(CTL_TOPIC_NAME, 0, nextOffset++, "key", bytes)); | ||
|
|
||
| Event commitReady = | ||
| new Event( | ||
| config.connectGroupId(), | ||
| new DataComplete( | ||
| commitId, ImmutableList.of(new TopicPartitionOffset("topic", 1, 1L, null)))); | ||
| bytes = AvroUtil.encode(commitReady); | ||
| consumer.addRecord(new ConsumerRecord<>(CTL_TOPIC_NAME, 0, nextOffset++, "key", bytes)); | ||
|
|
||
| coordinator.process(); | ||
| } | ||
|
|
||
| private void assertCommitTable(int idx, UUID commitId, OffsetDateTime ts) { | ||
| byte[] bytes = producer.history().get(idx).value(); | ||
| Event commitTable = AvroUtil.decode(bytes); | ||
|
|
@@ -288,6 +389,8 @@ public void testCoordinatorCommittedOffsetMerging() { | |
|
|
||
| @Test | ||
| public void testCoordinatorCommittedOffsetValidation() { | ||
| when(config.commitMaxConsecutiveFailures()).thenReturn(1); | ||
|
|
||
| // This test demonstrates that the Coordinator's validateAndCommit method | ||
| // prevents commits when another independent commit has updated the offsets | ||
| // during the commit process | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two small things I'd fold in while we're here:
The property name
iceberg.connect.commit.max-consecutive-failuresintroduces a newiceberg.connect.commit.*namespace — the rest of the commit knobs in this file live under a different prefix. Once this ships it's public and renaming costs a deprecation cycle. Worth aligning with the existing commit-prefix convention now.And I'd add
ConfigDef.Range.atLeast(1)as the validator argument —0or negative silently makes the feature unreachable.