apache · hachikuji · Jun 12, 2017 · Jun 13, 2017 · Jun 13, 2017 · Jun 13, 2017
diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala
@@ -46,8 +46,6 @@ import java.util.Map.{Entry => JEntry}
 import java.lang.{Long => JLong}
 import java.util.regex.Pattern
 
-import org.apache.kafka.common.internals.Topic
-
 object LogAppendInfo {
   val UnknownLogAppendInfo = LogAppendInfo(-1, -1, RecordBatch.NO_TIMESTAMP, -1L, RecordBatch.NO_TIMESTAMP,
     NoCompressionCodec, NoCompressionCodec, -1, -1, offsetsMonotonic = false)
@@ -431,24 +429,26 @@ class Log(@volatile var dir: File,
   private def loadProducerState(lastOffset: Long): Unit = lock synchronized {
     info(s"Loading producer state from offset $lastOffset for partition $topicPartition")
 
+    // To avoid expensive initialization when upgrading from older brokers, we skip loading producer
+    // state if no snapshot file is found. To ensure that we cannot hit this case after upgrading (which
+    // could cause us to lose producer state), we enforce the invariant that we always have an empty snapshot
+    // file at the log start offset.
+
     if (producerStateManager.latestSnapshotOffset.isEmpty) {
-      // if there are no snapshots to load producer state from, we assume that the brokers are
-      // being upgraded, which means there would be no previous idempotent/transactional producers
-      // to load state for. To avoid an expensive scan through all of the segments, we take
-      // empty snapshots from the start of the last two segments and the last offset. The purpose
-      // of taking the segment snapshots is to avoid the full scan in the case that the log needs
-      // truncation.
+      // There are no snapshots so this is the upgrade path. In addition to taking a snapshot at the log start
+      // offset to enforce the invariant mentioned above, we take empty snapshots from the start of the last
+      // two segments and the last offset. The purpose of the additional snapshots is to avoid the full scan in
+      // the case that the log needs truncation.
       val nextLatestSegmentBaseOffset = Option(segments.lowerEntry(activeSegment.baseOffset)).map(_.getValue.baseOffset)
-      val offsetsToSnapshot = Seq(nextLatestSegmentBaseOffset, Some(activeSegment.baseOffset), Some(lastOffset))
-      offsetsToSnapshot.flatten.foreach { offset =>
-        producerStateManager.updateMapEndOffset(offset)
-        producerStateManager.takeSnapshot()
-      }
+      val offsetsToSnapshot = Seq(Some(logStartOffset), nextLatestSegmentBaseOffset, Some(activeSegment.baseOffset), Some(lastOffset))
+      offsetsToSnapshot.flatten.foreach(producerStateManager.takeEmptySnapshot)
     } else {
-      val currentTimeMs = time.milliseconds
-      producerStateManager.truncateAndReload(logStartOffset, lastOffset, currentTimeMs)
+      // Ensure we have an empty snapshot at the log start offset to enforce the invariant mentioned above.
+      // This must be done prior to truncation in case of failure after previous snapshots are removed.
+      producerStateManager.takeEmptySnapshot(logStartOffset)
+      producerStateManager.truncateAndReload(logStartOffset, lastOffset, time.milliseconds())
 
-      // only do the potentially expensive reloading of the last snapshot offset is lower than the
+      // Only do the potentially expensive reloading if the last snapshot offset is lower than the
       // log end offset (which would be the case on first startup) and there are active producers.
       // if there are no active producers, then truncating shouldn't change that fact (although it
       // could cause a producerId to expire earlier than expected), so we can skip the loading.
@@ -478,7 +478,7 @@ class Log(@volatile var dir: File,
     completedTxns.foreach(producerStateManager.completeTxn)
   }
 
-  private[log] def activePids: Map[Long, ProducerIdEntry] = lock synchronized {
+  private[log] def activeProducers: Map[Long, ProducerIdEntry] = lock synchronized {
     producerStateManager.activeProducers
   }
 
@@ -706,6 +706,11 @@ class Log(@volatile var dir: File,
     lock synchronized {
       if (offset > logStartOffset) {
         logStartOffset = offset
+
+        // Enforce the invariant that we have an empty snapshot at the log start offset to ensure
+        // proper loading of producer state upon recovery.
+        producerStateManager.takeEmptySnapshot(logStartOffset)
+        producerStateManager.deleteSnapshotsBefore(logStartOffset)
       }
     }
   }
@@ -1050,7 +1055,11 @@ class Log(@volatile var dir: File,
         deletable.foreach(deleteSegment)
         logStartOffset = math.max(logStartOffset, segments.firstEntry().getValue.baseOffset)
         leaderEpochCache.clearAndFlushEarliest(logStartOffset)
-        producerStateManager.evictUnretainedProducers(logStartOffset)
+
+        // Update the producer state with the new log start offset, which will cause any non-retained producers to
+        // be evicted. Enforce the invariant that we always have an empty snapshot at the log start offset.
+        producerStateManager.takeEmptySnapshot(logStartOffset)
+        producerStateManager.truncateHead(logStartOffset)
         updateFirstUnstableOffset()
       }
     }
@@ -1255,10 +1264,10 @@ class Log(@volatile var dir: File,
     for(segment <- logSegments(this.recoveryPoint, offset))
       segment.flush()
 
-    // now that we have flushed, we can cleanup old producer snapshots. However, it is useful to retain
-    // the snapshots from the recent segments in case we need to truncate and rebuild the producer state.
-    // Otherwise, we would always need to rebuild from the earliest segment.
-    producerStateManager.deleteSnapshotsBefore(minSnapshotOffsetToRetain(offset))
+    // Now that we have flushed, we can cleanup old producer snapshots. However, it is useful to retain the
+    // snapshots from the recent segments in case we need to truncate and rebuild the producer state. Note that
+    // we still retain the snapshot from the log start offset.
+    producerStateManager.deleteSnapshotsInRangeExclusive(logStartOffset, minSnapshotOffsetToRetain(offset))
 
     lock synchronized {
       if(offset > this.recoveryPoint) {
@@ -1364,6 +1373,11 @@ class Log(@volatile var dir: File,
 
       producerStateManager.truncate()
       producerStateManager.updateMapEndOffset(newOffset)
+
+      // Truncation results in all snapshot files being removed, so take an empty snapshot at the new offset
+      // to maintain the invariant that we always have a snapshot at the log start offset.
+      producerStateManager.takeEmptySnapshot(newOffset)
+
       updateFirstUnstableOffset()
 
       this.recoveryPoint = math.min(newOffset, this.recoveryPoint)

diff --git a/core/src/main/scala/kafka/log/LogCleaner.scala b/core/src/main/scala/kafka/log/LogCleaner.scala
@@ -424,7 +424,7 @@ private[log] class Cleaner(val id: Int,
         info("Cleaning segment %s in log %s (largest timestamp %s) into %s, %s deletes."
           .format(startOffset, log.name, new Date(oldSegmentOpt.largestTimestamp), cleaned.baseOffset, if(retainDeletes) "retaining" else "discarding"))
         cleanInto(log.topicPartition, oldSegmentOpt, cleaned, map, retainDeletes, log.config.maxMessageSize, transactionMetadata,
-          log.activePids, stats)
+          log.activeProducers, stats)
 
         currentSegmentOpt = nextSegmentOpt
       }

diff --git a/core/src/main/scala/kafka/log/ProducerStateManager.scala b/core/src/main/scala/kafka/log/ProducerStateManager.scala
@@ -376,7 +376,7 @@ class ProducerStateManager(val topicPartition: TopicPartition,
   private val validateSequenceNumbers = topicPartition.topic != Topic.GROUP_METADATA_TOPIC_NAME
   private val producers = mutable.Map.empty[Long, ProducerIdEntry]
   private var lastMapOffset = 0L
-  private var lastSnapOffset = 0L
+  private var lastSnapOffset = -1L
 
   // ongoing transactions sorted by the first offset of the transaction
   private val ongoingTxns = new util.TreeMap[Long, TxnMetadata]
@@ -444,7 +444,7 @@ class ProducerStateManager(val topicPartition: TopicPartition,
               Files.deleteIfExists(file.toPath)
           }
         case None =>
-          lastSnapOffset = logStartOffset
+          lastSnapOffset = -1
           lastMapOffset = logStartOffset
           return
       }
@@ -472,17 +472,24 @@ class ProducerStateManager(val topicPartition: TopicPartition,
     }
   }
 
+  private def inRangeInclusive(snapshotFile: File, startOffset: Long, endOffset: Long): Boolean = {
+    val offset = offsetFromFilename(snapshotFile.getName) // offset is derived from the file name, not the file contents
+    offset >= startOffset && offset <= endOffset // true when startOffset <= offset <= endOffset (both bounds included)
+  }
+
+  private def inRangeExclusive(snapshotFile: File, startOffset: Long, endOffset: Long): Boolean = {
+    val offset = offsetFromFilename(snapshotFile.getName) // offset is derived from the file name, not the file contents
+    offset > startOffset && offset < endOffset // true when startOffset < offset < endOffset (both bounds excluded)
+  }
+
   /**
    * Truncate the producer id mapping to the given offset range and reload the entries from the most recent
    * snapshot in range (if there is one). Note that the log end offset is assumed to be less than
    * or equal to the high watermark.
    */
   def truncateAndReload(logStartOffset: Long, logEndOffset: Long, currentTimeMs: Long) {
     // remove all out of range snapshots
-    deleteSnapshotFiles { file =>
-      val offset = offsetFromFilename(file.getName)
-      offset > logEndOffset || offset <= logStartOffset
-    }
+    deleteSnapshotFiles(!inRangeInclusive(_, logStartOffset, logEndOffset))
 
     if (logEndOffset != mapEndOffset) {
       producers.clear()
@@ -493,7 +500,7 @@ class ProducerStateManager(val topicPartition: TopicPartition,
       unreplicatedTxns.clear()
       loadFromSnapshot(logStartOffset, currentTimeMs)
     } else {
-      evictUnretainedProducers(logStartOffset)
+      truncateHead(logStartOffset)
     }
   }
 
@@ -541,6 +548,12 @@ class ProducerStateManager(val topicPartition: TopicPartition,
     }
   }
 
+  def takeEmptySnapshot(offset: Long) = { // NOTE(review): public method with an inferred result type; consider annotating it explicitly
+    val snapshotFile = Log.producerSnapshotFile(logDir, offset)
+    debug(s"Writing empty producer snapshot for partition $topicPartition at offset $offset")
+    writeSnapshot(snapshotFile, mutable.Map.empty) // an empty map is passed, so no producer entries are written; in-memory state is not touched here
+  }
+
   /**
    * Get the last offset (exclusive) of the latest snapshot file.
    */
@@ -553,17 +566,18 @@ class ProducerStateManager(val topicPartition: TopicPartition,
 
   /**
    * When we remove the head of the log due to retention, we need to clean up the id map. This method takes
-   * the new start offset and removes all producerIds which have a smaller last written offset.
+   * the new start offset and removes all producerIds which have a smaller last written offset. Additionally,
+   * all snapshot files at offsets strictly lower than the log start offset will be removed.
    */
-  def evictUnretainedProducers(logStartOffset: Long) {
+  def truncateHead(logStartOffset: Long) {
     val evictedProducerEntries = producers.filter(_._2.lastOffset < logStartOffset)
     val evictedProducerIds = evictedProducerEntries.keySet
 
     producers --= evictedProducerIds
     removeEvictedOngoingTransactions(evictedProducerIds)
     removeUnreplicatedTransactions(logStartOffset)
 
-    deleteSnapshotFiles(file => offsetFromFilename(file.getName) <= logStartOffset)
+    deleteSnapshotsBefore(logStartOffset)
     if (lastMapOffset < logStartOffset)
       lastMapOffset = logStartOffset
     lastSnapOffset = latestSnapshotOffset.getOrElse(logStartOffset)
@@ -596,7 +610,7 @@ class ProducerStateManager(val topicPartition: TopicPartition,
     ongoingTxns.clear()
     unreplicatedTxns.clear()
     deleteSnapshotFiles()
-    lastSnapOffset = 0L
+    lastSnapOffset = -1L
     lastMapOffset = 0L
   }
 
@@ -620,6 +634,11 @@ class ProducerStateManager(val topicPartition: TopicPartition,
     deleteSnapshotFiles(file => offsetFromFilename(file.getName) < offset)
   }
 
+  @threadsafe
+  def deleteSnapshotsInRangeExclusive(startOffset: Long, endOffset: Long): Unit = {
+    deleteSnapshotFiles(inRangeExclusive(_, startOffset, endOffset)) // selects only startOffset < offset < endOffset; snapshots at either bound are not selected
+  }
+
   private def listSnapshotFiles: List[File] = {
     if (logDir.exists && logDir.isDirectory)
       logDir.listFiles.filter(f => f.isFile && isSnapshotFile(f.getName)).toList

diff --git a/core/src/test/scala/unit/kafka/log/LogManagerTest.scala b/core/src/test/scala/unit/kafka/log/LogManagerTest.scala
@@ -105,8 +105,8 @@ class LogManagerTest {
     assertEquals("Now there should only be only one segment in the index.", 1, log.numberOfSegments)
     time.sleep(log.config.fileDeleteDelayMs + 1)
 
-    // there should be a log file, two indexes, and the leader epoch checkpoint
-    assertEquals("Files should have been deleted", log.numberOfSegments * 3 + 1, log.dir.list.length)
+    // there should be a log file, two indexes, one empty producer snapshot, and the leader epoch checkpoint
+    assertEquals("Files should have been deleted", log.numberOfSegments * 4 + 1, log.dir.list.length)
     assertEquals("Should get empty fetch off new log.", 0, log.readUncommitted(offset+1, 1024).records.sizeInBytes)
 
     try {
@@ -132,7 +132,7 @@ class LogManagerTest {
     val config = LogConfig.fromProps(logConfig.originals, logProps)
 
     logManager = createLogManager()
-    logManager.startup
+    logManager.startup()
 
     // create a log
     val log = logManager.createLog(new TopicPartition(name, 0), config)
@@ -154,8 +154,8 @@ class LogManagerTest {
     time.sleep(log.config.fileDeleteDelayMs + 1)
 
     // there should be a log file, two indexes (the txn index is created lazily),
-    // the leader epoch checkpoint and two pid mapping files (one for the active and previous segments)
-    assertEquals("Files should have been deleted", log.numberOfSegments * 3 + 3, log.dir.list.length)
+    // the leader epoch checkpoint and three pid mapping files (one each for the active segment, the previous segment, and the log start offset)
+    assertEquals("Files should have been deleted", log.numberOfSegments * 3 + 4, log.dir.list.length)
     assertEquals("Should get empty fetch off new log.", 0, log.readUncommitted(offset + 1, 1024).records.sizeInBytes)
     try {
       log.readUncommitted(0, 1024)