KAFKA-4843: More efficient round-robin scheduler #2643
Changes from 32 commits
@@ -223,6 +223,7 @@ private synchronized void setStateWhenNotInPendingShutdown(final State newState)
     private final TaskCreator taskCreator = new TaskCreator();

     final ConsumerRebalanceListener rebalanceListener;
+    private final static int UNLIMITED_RECORDS = -1;

     public synchronized boolean isInitialized() {
         return state == State.RUNNING;

@@ -519,107 +520,168 @@ private long computeLatency() {
         return Math.max(this.timerStartedMs - previousTimeMs, 0);
     }

-    private void runLoop() {
-        int totalNumBuffered = 0;
-        boolean requiresPoll = true;
-        boolean polledRecords = false;
-
-        consumer.subscribe(sourceTopicPattern, rebalanceListener);
-
-        while (stillRunning()) {
-            this.timerStartedMs = time.milliseconds();
-
-            // try to fetch some records if necessary
-            if (requiresPoll) {
-                requiresPoll = false;
-
-                boolean longPoll = totalNumBuffered == 0;
+    /**
+     * Get the next batch of records by polling.
+     * @return Next batch of records or null if no records available.
+     */
+    private ConsumerRecords<byte[], byte[]> pollRequests(final long pollTimeMs) {
+        ConsumerRecords<byte[], byte[]> records = null;

-                ConsumerRecords<byte[], byte[]> records = null;
+        try {
+            records = consumer.poll(pollTimeMs);
+        } catch (NoOffsetForPartitionException ex) {
+            TopicPartition partition = ex.partition();
+            if (builder.earliestResetTopicsPattern().matcher(partition.topic()).matches()) {
+                log.info(String.format("stream-thread [%s] setting topic to consume from earliest offset %s", this.getName(), partition.topic()));
+                consumer.seekToBeginning(ex.partitions());
+            } else if (builder.latestResetTopicsPattern().matcher(partition.topic()).matches()) {
+                consumer.seekToEnd(ex.partitions());
+                log.info(String.format("stream-thread [%s] setting topic to consume from latest offset %s", this.getName(), partition.topic()));
+            } else {

-                try {
-                    records = consumer.poll(longPoll ? this.pollTimeMs : 0);
-                } catch (NoOffsetForPartitionException ex) {
-                    TopicPartition partition = ex.partition();
-                    if (builder.earliestResetTopicsPattern().matcher(partition.topic()).matches()) {
-                        log.info(String.format("stream-thread [%s] setting topic to consume from earliest offset %s", this.getName(), partition.topic()));
-                        consumer.seekToBeginning(ex.partitions());
-                    } else if (builder.latestResetTopicsPattern().matcher(partition.topic()).matches()) {
-                        consumer.seekToEnd(ex.partitions());
-                        log.info(String.format("stream-thread [%s] setting topic to consume from latest offset %s", this.getName(), partition.topic()));
-                    } else {
+                if (originalReset == null || (!originalReset.equals("earliest") && !originalReset.equals("latest"))) {
+                    setState(State.PENDING_SHUTDOWN);
+                    String errorMessage = "No valid committed offset found for input topic %s (partition %s) and no valid reset policy configured." +
+                        " You need to set configuration parameter \"auto.offset.reset\" or specify a topic specific reset " +
+                        "policy via KStreamBuilder#stream(StreamsConfig.AutoOffsetReset offsetReset, ...) or KStreamBuilder#table(StreamsConfig.AutoOffsetReset offsetReset, ...)";
+                    throw new StreamsException(String.format(errorMessage, partition.topic(), partition.partition()), ex);
+                }

-                        if (originalReset == null || (!originalReset.equals("earliest") && !originalReset.equals("latest"))) {
-                            setState(State.PENDING_SHUTDOWN);
-                            String errorMessage = "No valid committed offset found for input topic %s (partition %s) and no valid reset policy configured." +
-                                " You need to set configuration parameter \"auto.offset.reset\" or specify a topic specific reset " +
-                                "policy via KStreamBuilder#stream(StreamsConfig.AutoOffsetReset offsetReset, ...) or KStreamBuilder#table(StreamsConfig.AutoOffsetReset offsetReset, ...)";
-                            throw new StreamsException(String.format(errorMessage, partition.topic(), partition.partition()), ex);
-                        }
+                if (originalReset.equals("earliest")) {
+                    consumer.seekToBeginning(ex.partitions());
+                } else if (originalReset.equals("latest")) {
+                    consumer.seekToEnd(ex.partitions());
+                }
+                log.info(String.format("stream-thread [%s] no custom setting defined for topic %s using original config %s for offset reset", this.getName(), partition.topic(), originalReset));
+            }
+        }

-                        if (originalReset.equals("earliest")) {
-                            consumer.seekToBeginning(ex.partitions());
-                        } else if (originalReset.equals("latest")) {
-                            consumer.seekToEnd(ex.partitions());
-                        }
-                        log.info(String.format("stream-thread [%s] no custom setting defined for topic %s using original config %s for offset reset", this.getName(), partition.topic(), originalReset));
-                    }
-                }
+        if (rebalanceException != null)
+            throw new StreamsException(logPrefix + " Failed to rebalance", rebalanceException);

-                if (rebalanceException != null)
-                    throw new StreamsException(logPrefix + " Failed to rebalance", rebalanceException);
+        return records;
+    }

-                if (records != null && !records.isEmpty()) {
-                    int numAddedRecords = 0;
+    /**
+     * Take records and add them to each respective task.
+     * @param records Records, can be null
+     */
+    private void addRecordsToTasks(ConsumerRecords<byte[], byte[]> records) {
+        if (records != null && !records.isEmpty()) {
+            int numAddedRecords = 0;

-                    for (TopicPartition partition : records.partitions()) {
-                        StreamTask task = activeTasksByPartition.get(partition);
-                        numAddedRecords += task.addRecords(partition, records.records(partition));
-                    }
-                    streamsMetrics.skippedRecordsSensor.record(records.count() - numAddedRecords, timerStartedMs);
-                    polledRecords = true;
-                } else {
-                    polledRecords = false;
-                }
+            for (TopicPartition partition : records.partitions()) {
+                StreamTask task = activeTasksByPartition.get(partition);
+                numAddedRecords += task.addRecords(partition, records.records(partition));
+            }
+            streamsMetrics.skippedRecordsSensor.record(records.count() - numAddedRecords, timerStartedMs);
+        }
+    }

-                // only record poll latency if long poll is required
-                if (longPoll) {
-                    streamsMetrics.pollTimeSensor.record(computeLatency(), timerStartedMs);
-                }
-            }
+    /**
+     * Schedule the records processing by selecting which record is processed next. Commits may
+     * happen as records are processed.
+     * @param tasks The tasks that have records.
+     * @param recordsProcessedBeforeCommit number of records to be processed before commit is called;
+     *                                     if UNLIMITED_RECORDS, then commit is never called
+     * @return Number of records processed since last commit.
+     */
+    private long processAndPunctuate(final Map<TaskId, StreamTask> tasks,
+                                     final long recordsProcessedBeforeCommit) {

+        long totalProcessedEachRound;
+        long totalProcessedSinceLastMaybeCommit = 0;
+        // Round-robin scheduling by taking one record from each task repeatedly
+        // until no task has any records left
+        do {
+            totalProcessedEachRound = 0;
+            for (StreamTask task : tasks.values()) {
+                // we processed one record from this task;
+                // any further records stay buffered for the next round
+                if (task.process() > 0) {
+                    totalProcessedEachRound++;
+                    totalProcessedSinceLastMaybeCommit++;
+                }
+            }
+            if (recordsProcessedBeforeCommit != UNLIMITED_RECORDS &&
+                totalProcessedSinceLastMaybeCommit >= recordsProcessedBeforeCommit) {
+                // record the per-record latency before resetting the counter,
+                // otherwise the division below would be by zero
+                long processLatency = computeLatency();
+                streamsMetrics.processTimeSensor.record(processLatency / (double) totalProcessedSinceLastMaybeCommit,
+                    timerStartedMs);
+                totalProcessedSinceLastMaybeCommit = 0;
+                maybeCommit(this.timerStartedMs);
[Review comment] One meta question: since we are only checking for commit at the round boundary anyways (by …). Also I'm wondering if we should update …
[Reply] Also, to add to the first question: we now calculate if we are overshooting the commit interval and adjust the number of records accordingly. So we don't overshoot the commit interval anymore.
[Reply] OK makes sense.
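To make the overshoot point above concrete, here is a minimal, runnable sketch of the proportional scaling that adjustRecordsProcessedBeforeCommit (further down in this diff) applies; it is not part of the patch, and all numbers are invented for illustration.

    // Sketch only: the proportional adjustment that keeps a round of
    // processing from overshooting the commit interval. Numbers are invented.
    public class RecordsBeforeCommitExample {
        public static void main(String[] args) {
            final long commitTime = 30_000;      // commit.interval.ms
            final long totalProcessed = 120_000; // records processed in the last round
            final long processLatency = 45_000;  // ms it took to process them

            // Processing took 45 s against a 30 s commit interval, so scale the
            // per-round record budget down: 30/45 of 120,000 records = 80,000.
            final long recordsProcessedBeforeCommit =
                Math.max(1, (commitTime * totalProcessed) / processLatency);

            System.out.println(recordsProcessedBeforeCommit); // prints 80000
        }
    }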
+            }
+        } while (totalProcessedEachRound != 0);

-            // try to process one fetch record from each task via the topology, and also trigger punctuate
-            // functions if necessary, which may result in more records going through the topology in this loop
-            if (totalNumBuffered > 0 || polledRecords) {
-                totalNumBuffered = 0;
-
-                if (!activeTasks.isEmpty()) {
-                    for (StreamTask task : activeTasks.values()) {
+        // go over the tasks again to punctuate or commit
+        for (StreamTask task : tasks.values()) {
+            maybePunctuate(task);
+            if (task.commitNeeded())
+                commitOne(task);
+        }

-                        totalNumBuffered += task.process();
+        return totalProcessedSinceLastMaybeCommit;
+    }

-                        requiresPoll = requiresPoll || task.requiresPoll();
+    /**
+     * Adjust the number of records that should be processed by the scheduler. This avoids
+     * scenarios where the processing time is higher than the commit time.
+     * @param prevRecordsProcessedBeforeCommit Previous number of records processed by the scheduler.
+     * @param totalProcessed Total number of records processed in this last round.
+     * @param processLatency Total processing latency in ms for this last round.
+     * @param commitTime Desired commit time in ms.
+     * @return An adjusted number of records to be processed in the next round.
+     */
+    private long adjustRecordsProcessedBeforeCommit(final long prevRecordsProcessedBeforeCommit, final long totalProcessed,
[Review comment] I think this method could be reduced to sth like: …
[Reply] I'd only want to make this adjustment if processLatency > commitTime though. In most cases that would mean no adjustment is needed at all and all records that are polled are processed. E.g., with the default commit time of 30 seconds currently I don't want to make any adjustments at all.
[Reply] ok - yep. I see: if it was at the default, i.e., UNLIMITED, and processLatency < commitTime, then don't adjust. Makes sense.
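The reviewer's inline snippet did not survive extraction. As a hypothetical reconstruction only, a reduced version consistent with what the thread agrees on (no adjustment while at the UNLIMITED_RECORDS default and processLatency < commitTime) might look like this:

    // Hypothetical sketch of the suggested reduction, not the reviewer's actual
    // snippet: one formula for both directions, and no adjustment while the
    // UNLIMITED_RECORDS default is in effect and latency stays under the commit time.
    private long adjustRecordsProcessedBeforeCommit(final long prevRecordsProcessedBeforeCommit,
                                                    final long totalProcessed,
                                                    final long processLatency,
                                                    final long commitTime) {
        if (processLatency <= 0 ||
            (prevRecordsProcessedBeforeCommit == UNLIMITED_RECORDS && processLatency <= commitTime)) {
            return prevRecordsProcessedBeforeCommit;  // leave the budget unchanged
        }
        // push the budget down (latency > commit time) or up (latency < commit time)
        return Math.max(1, (commitTime * totalProcessed) / processLatency);
    }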
+                                                    final long processLatency, final long commitTime) {
+        long recordsProcessedBeforeCommit = UNLIMITED_RECORDS;
+        // check if process latency is larger than commit latency
+        // note that once we set recordsProcessedBeforeCommit, it will never be UNLIMITED_RECORDS again, so
+        // we will never process all records again. This might be an issue if the initial measurement
+        // was off due to a slow start.
+        if (processLatency > commitTime) {
+            // push down
+            recordsProcessedBeforeCommit = Math.max(1, (commitTime * totalProcessed) / processLatency);
+            log.debug("{} processing latency {} > commit time {} for {} records. Adjusting down recordsProcessedBeforeCommit={}",
+                logPrefix, processLatency, commitTime, totalProcessed, recordsProcessedBeforeCommit);
+        } else if (prevRecordsProcessedBeforeCommit != UNLIMITED_RECORDS && processLatency > 0) {
+            // push up
+            recordsProcessedBeforeCommit = Math.max(1, (commitTime * totalProcessed) / processLatency);
+            log.debug("{} processing latency {} < commit time {} for {} records. Adjusting up recordsProcessedBeforeCommit={}",
+                logPrefix, processLatency, commitTime, totalProcessed, recordsProcessedBeforeCommit);
+        }

-                        streamsMetrics.processTimeSensor.record(computeLatency(), timerStartedMs);
+        return recordsProcessedBeforeCommit;
+    }

-                        maybePunctuate(task);
+    /**
+     * Main event loop for polling, and processing records through topologies.
+     */
+    private void runLoop() {
+        long recordsProcessedBeforeCommit = UNLIMITED_RECORDS;
+        consumer.subscribe(sourceTopicPattern, rebalanceListener);

-                        if (task.commitNeeded())
-                            commitOne(task);
-                    }
+        while (stillRunning()) {
+            this.timerStartedMs = time.milliseconds();

-                } else {
-                    // even when no task is assigned, we must poll to get a task.
-                    requiresPoll = true;
+            // try to fetch some records if necessary
+            ConsumerRecords<byte[], byte[]> records = pollRequests(this.pollTimeMs);
+            if (records != null && !records.isEmpty() && !activeTasks.isEmpty()) {
+                streamsMetrics.pollTimeSensor.record(computeLatency(), timerStartedMs);
+                addRecordsToTasks(records);
+                final long totalProcessed = processAndPunctuate(activeTasks, recordsProcessedBeforeCommit);
+                if (totalProcessed > 0) {
+                    final long processLatency = computeLatency();
+                    streamsMetrics.processTimeSensor.record(processLatency / (double) totalProcessed,
[Review comment] Do we still need to record …
[Reply] We still need it. For example, if …
[Reply] Thanks for clarifying.
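The elided references above are lost, but judging from the surrounding diff the question concerns this processTimeSensor call in runLoop. One observation that follows directly from the code shown earlier: the corresponding measurement inside processAndPunctuate sits behind a guard and never fires while recordsProcessedBeforeCommit is still the UNLIMITED_RECORDS default, so the call here is the only place the average per-record latency gets measured in that case. The guard, quoted for reference:

    // From processAndPunctuate above: this measurement is skipped entirely
    // while recordsProcessedBeforeCommit == UNLIMITED_RECORDS.
    if (recordsProcessedBeforeCommit != UNLIMITED_RECORDS &&
        totalProcessedSinceLastMaybeCommit >= recordsProcessedBeforeCommit) {
        // ... record processTimeSensor and maybeCommit ...
    }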
+                        timerStartedMs);
+                    recordsProcessedBeforeCommit = adjustRecordsProcessedBeforeCommit(recordsProcessedBeforeCommit, totalProcessed,
+                        processLatency, commitTimeMs);
+                }

-            } else {
-                requiresPoll = true;
-            }
+            }

             maybeCommit(timerStartedMs);
             maybeUpdateStandbyTasks();

             maybeClean(timerStartedMs);
         }
         log.info("{} Shutting down at user request", logPrefix);

@@ -692,8 +754,9 @@ private void maybePunctuate(StreamTask task) {
     protected void maybeCommit(final long now) {

         if (commitTimeMs >= 0 && lastCommitMs + commitTimeMs < now) {
-            log.info("{} Committing all active tasks {} and standby tasks {} because the commit interval {}ms has elapsed",
-                logPrefix, commitTimeMs, activeTasks, standbyTasks);
+            log.info("{} Committing all active tasks {} and standby tasks {} because the commit interval {}ms has elapsed by {}ms",
+                logPrefix, activeTasks, standbyTasks, commitTimeMs, now - lastCommitMs);

             commitAll();
             lastCommitMs = now;
[Review comment] In this case could we just return a boolean?
[Reply] Ok
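A hypothetical sketch of the boolean-returning maybeCommit the reviewer suggests (not code from this PR):

    // Sketch of the reviewer's suggestion (hypothetical): report whether a
    // commit actually happened so callers can react to it.
    protected boolean maybeCommit(final long now) {
        if (commitTimeMs >= 0 && lastCommitMs + commitTimeMs < now) {
            log.info("{} Committing all active tasks {} and standby tasks {} because the commit interval {}ms has elapsed by {}ms",
                logPrefix, activeTasks, standbyTasks, commitTimeMs, now - lastCommitMs);

            commitAll();
            lastCommitMs = now;
            return true;   // a commit was performed
        }
        return false;      // commit interval has not elapsed yet
    }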