From ff933bd9ad3542a9509aa3abf59202735b42a142 Mon Sep 17 00:00:00 2001
From: akonopko
Date: Wed, 4 Oct 2017 21:01:14 +0300
Subject: [PATCH] Add spark.streaming.backpressure.initialRate to direct Kafka
 streams

---
 .../kafka010/DirectKafkaInputDStream.scala |  8 +-
 .../kafka010/DirectKafkaStreamSuite.scala  | 70 ++++++++++++++++
 .../kafka/DirectKafkaInputDStream.scala    | 13 ++-
 .../kafka/DirectKafkaStreamSuite.scala     | 83 +++++++++++++++++++
 4 files changed, 171 insertions(+), 3 deletions(-)

diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala
index 9a4a1cf32a480..cf96c4590d398 100644
--- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala
+++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala
@@ -58,6 +58,9 @@ private[spark] class DirectKafkaInputDStream[K, V](
     ppc: PerPartitionConfig
   ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {

+  private val initialRate = context.sparkContext.getConf.getLong(
+    "spark.streaming.backpressure.initialRate", 0)
+
   val executorKafkaParams = {
     val ekp = new ju.HashMap[String, Object](consumerStrategy.executorKafkaParams)
     KafkaUtils.fixKafkaParams(ekp)
@@ -128,7 +131,10 @@ private[spark] class DirectKafkaInputDStream[K, V](

   protected[streaming] def maxMessagesPerPartition(
     offsets: Map[TopicPartition, Long]): Option[Map[TopicPartition, Long]] = {
-    val estimatedRateLimit = rateController.map(_.getLatestRate())
+    val estimatedRateLimit = rateController.map(x => {
+      val lr = x.getLatestRate()
+      if (lr > 0) lr else initialRate
+    })

     // calculate a per-partition rate limit based on current lag
     val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match {
diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala
index 88a312a189cee..32af1382e1ca2 100644
--- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala
+++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala
@@ -551,6 +551,76 @@ class DirectKafkaStreamSuite
       Map(new TopicPartition(topic, 0) -> 5L, new TopicPartition(topic, 1) -> 10L))
   }

+  test("use backpressure.initialRate with backpressure") {
+    val topic = "backpressureInitialRate"
+    val kafkaParams = getKafkaParams("auto.offset.reset" -> "earliest")
+    val sparkConf = new SparkConf()
+      // Safe, even with streaming, because we're using the direct API.
+      // Using 1 core is useful to make the test more predictable.
+      .setMaster("local[1]")
+      .setAppName(this.getClass.getSimpleName)
+      .set("spark.streaming.backpressure.enabled", "true")
+      .set("spark.streaming.kafka.maxRatePerPartition", "1000")
+      .set("spark.streaming.backpressure.initialRate", "500")
+
+    val messages = Map("foo" -> 5000)
+    kafkaTestUtils.sendMessages(topic, messages)
+
+    ssc = new StreamingContext(sparkConf, Milliseconds(500))
+
+    val kafkaStream = withClue("Error creating direct stream") {
+      new DirectKafkaInputDStream[String, String](
+        ssc,
+        preferredHosts,
+        ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala),
+        new DefaultPerPartitionConfig(sparkConf)
+      )
+    }
+    kafkaStream.start()
+
+    val input = Map(new TopicPartition(topic, 0) -> 1000L)
+
+    assert(kafkaStream.maxMessagesPerPartition(input).flatMap(_.headOption).contains(
+      new TopicPartition(topic, 0) -> 250)) // we run for half a second
+
+    kafkaStream.stop()
+  }
+
+  test("backpressure.initialRate should honor maxRatePerPartition") {
+    val topic = "backpressureInitialRate"
+    val kafkaParams = getKafkaParams("auto.offset.reset" -> "earliest")
+    val sparkConf = new SparkConf()
+      // Safe, even with streaming, because we're using the direct API.
+      // Using 1 core is useful to make the test more predictable.
+      .setMaster("local[1]")
+      .setAppName(this.getClass.getSimpleName)
+      .set("spark.streaming.backpressure.enabled", "true")
+      .set("spark.streaming.kafka.maxRatePerPartition", "300")
+      .set("spark.streaming.backpressure.initialRate", "1000")
+
+    val messages = Map("foo" -> 5000)
+    kafkaTestUtils.sendMessages(topic, messages)
+
+    ssc = new StreamingContext(sparkConf, Milliseconds(500))
+
+    val kafkaStream = withClue("Error creating direct stream") {
+      new DirectKafkaInputDStream[String, String](
+        ssc,
+        preferredHosts,
+        ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala),
+        new DefaultPerPartitionConfig(sparkConf)
+      )
+    }
+    kafkaStream.start()
+
+    val input = Map(new TopicPartition(topic, 0) -> 1000L)
+
+    assert(kafkaStream.maxMessagesPerPartition(input).flatMap(_.headOption).contains(
+      new TopicPartition(topic, 0) -> 150L)) // we run for half a second
+
+    kafkaStream.stop()
+  }
+
   test("using rate controller") {
     val topic = "backpressure"
     kafkaTestUtils.createTopic(topic, 1)
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
index d52c230eb7849..0c5f7c4abff99 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
@@ -91,9 +91,16 @@ class DirectKafkaInputDStream[
   private val maxRateLimitPerPartition: Long = context.sparkContext.getConf.getLong(
     "spark.streaming.kafka.maxRatePerPartition", 0)

+  private val initialRate = context.sparkContext.getConf.getLong(
+    "spark.streaming.backpressure.initialRate", 0)
+
   protected[streaming] def maxMessagesPerPartition(
     offsets: Map[TopicAndPartition, Long]): Option[Map[TopicAndPartition, Long]] = {
-    val estimatedRateLimit = rateController.map(_.getLatestRate())
+
+    val estimatedRateLimit = rateController.map(x => {
+      val lr = x.getLatestRate()
+      if (lr > 0) lr else initialRate
+    })

     // calculate a per-partition rate limit based on current lag
     val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match {
@@ -108,7 +115,9 @@ class DirectKafkaInputDStream[
           tp -> (if (maxRateLimitPerPartition > 0) {
             Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate)
         }
-      case None => offsets.map { case (tp, offset) => tp -> maxRateLimitPerPartition }
+      case None => offsets.map { case (tp, offset) => tp -> {
+        maxRateLimitPerPartition
+      }}
     }

     if (effectiveRateLimitPerPartition.values.sum > 0) {
diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
index f8b34074f104f..2d44c02e51a4d 100644
--- a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
+++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
@@ -387,6 +387,89 @@ class DirectKafkaStreamSuite
       Map(TopicAndPartition(topic, 0) -> 10L, TopicAndPartition(topic, 1) -> 10L))
   }

+  test("use backpressure.initialRate with backpressure") {
+    val topic = "backpressureInitialRate"
+    val topicPartitions = Set(TopicAndPartition(topic, 0))
+    kafkaTestUtils.createTopic(topic, 1)
+    val kafkaParams = Map(
+      "metadata.broker.list" -> kafkaTestUtils.brokerAddress,
+      "auto.offset.reset" -> "smallest"
+    )
+
+    val sparkConf = new SparkConf()
+      // Safe, even with streaming, because we're using the direct API.
+      // Using 1 core is useful to make the test more predictable.
+      .setMaster("local[1]")
+      .setAppName(this.getClass.getSimpleName)
+      .set("spark.streaming.backpressure.enabled", "true")
+      .set("spark.streaming.backpressure.initialRate", "500")
+
+    val messages = Map("foo" -> 5000)
+    kafkaTestUtils.sendMessages(topic, messages)
+
+    ssc = new StreamingContext(sparkConf, Milliseconds(500))
+
+    val kafkaStream = withClue("Error creating direct stream") {
+      val kc = new KafkaCluster(kafkaParams)
+      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
+      val m = kc.getEarliestLeaderOffsets(topicPartitions)
+        .fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset))
+
+      new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
+        ssc, kafkaParams, m, messageHandler)
+    }
+    kafkaStream.start()
+
+    val input = Map(new TopicAndPartition(topic, 0) -> 1000L)
+
+    assert(kafkaStream.maxMessagesPerPartition(input).flatMap(_.headOption).contains(
+      new TopicAndPartition(topic, 0) -> 250)) // we run for half a second
+
+    kafkaStream.stop()
+  }
+
+  test("backpressure.initialRate should honor maxRatePerPartition") {
+    val topic = "backpressureInitialRate2"
+    val topicPartitions = Set(TopicAndPartition(topic, 0))
+    kafkaTestUtils.createTopic(topic, 1)
+    val kafkaParams = Map(
+      "metadata.broker.list" -> kafkaTestUtils.brokerAddress,
+      "auto.offset.reset" -> "smallest"
+    )
+
+    val sparkConf = new SparkConf()
+      // Safe, even with streaming, because we're using the direct API.
+      // Using 1 core is useful to make the test more predictable.
+      .setMaster("local[1]")
+      .setAppName(this.getClass.getSimpleName)
+      .set("spark.streaming.backpressure.enabled", "true")
+      .set("spark.streaming.kafka.maxRatePerPartition", "300")
+      .set("spark.streaming.backpressure.initialRate", "1000")
+
+    val messages = Map("foo" -> 5000)
+    kafkaTestUtils.sendMessages(topic, messages)
+
+    ssc = new StreamingContext(sparkConf, Milliseconds(500))
+
+    val kafkaStream = withClue("Error creating direct stream") {
+      val kc = new KafkaCluster(kafkaParams)
+      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
+      val m = kc.getEarliestLeaderOffsets(topicPartitions)
+        .fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset))
+
+      new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
+        ssc, kafkaParams, m, messageHandler)
+    }
+    kafkaStream.start()
+
+    val input = Map(new TopicAndPartition(topic, 0) -> 1000L)
+
+    assert(kafkaStream.maxMessagesPerPartition(input).flatMap(_.headOption).contains(
+      new TopicAndPartition(topic, 0) -> 150L)) // we run for half a second
+
+    kafkaStream.stop()
+  }
+
   test("using rate controller") {
     val topic = "backpressure"
     val topicPartitions = Set(TopicAndPartition(topic, 0), TopicAndPartition(topic, 1))