-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-7988][STREAMING] Round-robin scheduling of receivers by default #6607
Changes from 22 commits
fff1b2e
41705de
bb5e09b
b05ee2f
3cac21b
975b8d8
6e3515c
7888257
6caeefe
07b9dfa
02dbdb8
45e3a99
16e84ec
4cf97b6
f8a3e05
7f3e028
242e677
179b90f
4604f28
68e8540
bc23907
48a4a97
ae29152
9f1abc2
6127e58
f747739
1918819
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,10 @@ | |
|
||
package org.apache.spark.streaming.scheduler | ||
|
||
import scala.collection.mutable.{HashMap, SynchronizedMap} | ||
import scala.collection.mutable.{ArrayBuffer, HashMap, SynchronizedMap} | ||
import scala.language.existentials | ||
import scala.math.max | ||
import org.apache.spark.rdd._ | ||
|
||
import org.apache.spark.streaming.util.WriteAheadLogUtils | ||
import org.apache.spark.{Logging, SerializableWritable, SparkEnv, SparkException} | ||
|
@@ -270,6 +272,44 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false | |
} | ||
} | ||
|
||
/** | ||
* Get the list of executors excluding driver | ||
*/ | ||
/**
 * Get the list of executor hosts, excluding the driver host.
 * Hosts are derived from the "host:port" keys of the executor
 * memory status map reported by the SparkContext.
 */
private def getExecutors(ssc: StreamingContext): List[String] = {
  val driverHost = ssc.sparkContext.getConf.get("spark.driver.host")
  val executorHosts = ssc.sparkContext.getExecutorMemoryStatus.keys
    .map(_.split(":")(0))
    .toList
  executorHosts.diff(List(driverHost))
}
|
||
/** Set host location(s) for each receiver so as to distribute them over | ||
* executors in a round-robin fashion taking into account preferredLocation if set | ||
*/ | ||
/**
 * Set host location(s) for each receiver so as to distribute them over
 * executors in a round-robin fashion, taking into account each receiver's
 * preferredLocation if set.
 *
 * Returns an array parallel to `receivers`: entry i holds the candidate
 * host(s) for receivers(i). NOTE: if `executors` is empty, every entry is
 * left null — callers must check for this and fall back to unconstrained
 * placement.
 */
private[streaming] def scheduleReceivers(receivers: Seq[Receiver[_]],
    executors: List[String]): Array[ArrayBuffer[String]] = {
  val locations = new Array[ArrayBuffer[String]](receivers.length)
  if (executors.nonEmpty) {
    // Seed each receiver's location list with its preferred host, if any.
    for (i <- receivers.indices) {
      locations(i) = new ArrayBuffer[String]()
      receivers(i).preferredLocation.foreach(locations(i) += _)
    }

    // Hand out executors round-robin to receivers without a preference.
    // Iterating max(#receivers, #executors) times ensures every executor is
    // used when there are more executors than receivers (a receiver may then
    // get multiple candidate hosts).
    var nextExecutor = 0
    for (i <- 0 until max(receivers.length, executors.length)) {
      val r = i % receivers.length
      if (receivers(r).preferredLocation.isEmpty) {
        locations(r) += executors(nextExecutor)
        nextExecutor = (nextExecutor + 1) % executors.length
      }
    }
  }
  locations
}
|
||
/** | ||
* Get the receivers from the ReceiverInputDStreams, distributes them to the | ||
* worker nodes as a parallel collection, and runs them. | ||
|
@@ -281,18 +321,6 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false | |
rcvr | ||
}) | ||
|
||
// Right now, we only honor preferences if all receivers have them | ||
val hasLocationPreferences = receivers.map(_.preferredLocation.isDefined).reduce(_ && _) | ||
|
||
// Create the parallel collection of receivers to distributed them on the worker nodes | ||
val tempRDD = | ||
if (hasLocationPreferences) { | ||
val receiversWithPreferences = receivers.map(r => (r, Seq(r.preferredLocation.get))) | ||
ssc.sc.makeRDD[Receiver[_]](receiversWithPreferences) | ||
} else { | ||
ssc.sc.makeRDD(receivers, receivers.size) | ||
} | ||
|
||
val checkpointDirOption = Option(ssc.checkpointDir) | ||
val serializableHadoopConf = new SerializableWritable(ssc.sparkContext.hadoopConfiguration) | ||
|
||
|
@@ -308,12 +336,25 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false | |
supervisor.start() | ||
supervisor.awaitTermination() | ||
} | ||
|
||
// Run the dummy Spark job to ensure that all slaves have registered. | ||
// This avoids all the receivers to be scheduled on the same node. | ||
if (!ssc.sparkContext.isLocal) { | ||
ssc.sparkContext.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect() | ||
} | ||
|
||
// Get the list of executors and schedule receivers | ||
val executors = getExecutors(ssc) | ||
val locations = scheduleReceivers(receivers, executors) | ||
val tempRDD = | ||
if (locations(0) != null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Under what condition will There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sparkContext.isLocal == true There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then its more intuitive to check that directly. If !local, then schedule and makeRDD, otherwise, makeRDD There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. location(0) check is all-encompassing (no assumptions made about when it may be true). We can add a comment next to it to clarify that it can be null for local. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may be so, but its hard to read and understand the condition (which is why I asked). Also checking for null at location(0) to detect whether an ArrayBuffer was assigned to the position to detect whether very very brittle check and ties the logic deep with the implementation of the We try to design the code as intuitive and modular as possible, so that others can easily contribute. That's the only way to manage a large open source project with so many contributors. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally speaking, what if there were numerous conditions for which locations(0) could be null, would you enlist them all? It's common practice to do: Obj x = f(); if (x) {do blah}. 
If we don't want a check on locations(0), the right way would be to return null (or some such) from scheduleReceiver when locations(0) is null. So we can check if(locations) instead of if(locations(0)). Better still, we can check for if(!executors.isEmpty) before invoking scheduleReceiver, so no further check is needed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. First of all, in Scala, we try to not rely on null rather use Option and None. Here a suggestion, which I think is a cleaner design with clean semantics. The makeRDD is designed to take a So the
if there were no executors at that point in time, all the buffers will be empty. If you want to be extra careful, you can simply add a check that none of the returned locations are null. That still is just one line and easy-to-understand code, rather than introducing another level of conditions. How does this sound? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's fine. Following, I think, is slightly cleaner: if(!executors.isEmpty){ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SGTM! |
||
val roundRobinReceivers = (0 until receivers.length).map(i => | ||
(receivers(i), locations(i))) | ||
ssc.sc.makeRDD[Receiver[_]](roundRobinReceivers) | ||
} else { | ||
ssc.sc.makeRDD(receivers, receivers.size) | ||
} | ||
|
||
// Distribute the receivers and start them | ||
logInfo("Starting " + receivers.length + " receivers") | ||
running = true | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.streaming.scheduler | ||
|
||
import org.apache.spark.streaming._ | ||
import org.apache.spark.SparkConf | ||
import org.apache.spark.storage.StorageLevel | ||
import org.apache.spark.streaming.receiver._ | ||
import org.apache.spark.util.Utils | ||
|
||
/** Testsuite for receiver scheduling */ | ||
class SchedulerSuite extends TestSuiteBase { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this code is in ReceiverTracker, please make this the ReceiverTrackerSuite. |
||
val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test") | ||
val ssc = new StreamingContext(sparkConf, Milliseconds(100)) | ||
val tracker = new ReceiverTracker(ssc) | ||
val launcher = new tracker.ReceiverLauncher() | ||
|
||
test("receiver scheduling - no preferredLocation") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. preferredLocation --> preferred location |
||
val numReceivers = 10; | ||
val receivers = (1 to numReceivers).map(i => new DummyReceiver) | ||
val executors: List[String] = List("Host1", "Host2", "Host3", "Host4", "Host5") | ||
val locations = launcher.scheduleReceivers(receivers, executors) | ||
assert(locations(0)(0) === "Host1") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't test the sizes of the location(x) arrays. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems redundant There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if the scheduling algorithm allocates it to extra nodes that it should not? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In any case, adding a couple more checks doesn't hurt, so will do. |
||
assert(locations(4)(0) === "Host5") | ||
assert(locations(5)(0) === "Host1") | ||
assert(locations(9)(0) === "Host5") | ||
} | ||
|
||
test("receiver scheduling - no preferredLocation, numExecutors > numReceivers") { | ||
val numReceivers = 3; | ||
val receivers = (1 to numReceivers).map(i => new DummyReceiver) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a lot of duplicate code here. You can put all of the duplicate code in a function, say,
In fact all of them can be one or two There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes structure brittle, takes away code readability There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean "structure brittle" ? What structure? The current approach definitely a lot more verbose than what it can be. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Structure of the four tests, which could evolve separately as opposed to being centralized. The way these tests are written is pretty consistent with the rest of spark test suites, which can be easily read and debugged independently at the cost of slight code duplication. Take a look at JobCancellationSuite for example. What you're suggesting is elegant but adds complexity and makes the code hard to read. If we had a dozen such tests, the verbosity would bother me too. Will try and reduce the number of asserts, I think some of them are redundant. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well I am not sure how you claim that they are consistent with rest of test suites. As a counterexample, see RDDSuite. How about this for being easier to read?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure if you looked at JobCancellationSuite as an example. Anyways, this representation is definitely cleaner, let's try and incorporate it in the PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Glad we reached a consensus! Looking forward to the update. |
||
val executors: List[String] = List("Host1", "Host2", "Host3", "Host4", "Host5") | ||
val locations = launcher.scheduleReceivers(receivers, executors) | ||
assert(locations(0)(0) === "Host1") | ||
assert(locations(2)(0) === "Host3") | ||
assert(locations(0)(1) === "Host4") | ||
assert(locations(1)(1) === "Host5") | ||
} | ||
|
||
test("receiver scheduling - all have preferredLocation") { | ||
val numReceivers = 5; | ||
val receivers = (1 to numReceivers).map(i => new DummyReceiver(host = Some("Host" + i))) | ||
val executors: List[String] = List("Host1", "Host5", "Host4", "Host3", "Host2") | ||
val locations = launcher.scheduleReceivers(receivers, executors) | ||
assert(locations(1)(0) === "Host2") | ||
assert(locations(4)(0) === "Host5") | ||
} | ||
|
||
test("receiver scheduling - some have preferredLocation") { | ||
val numReceivers = 3; | ||
val receivers: Seq[Receiver[_]] = Seq( | ||
new DummyReceiver(host = Some("Host2")), | ||
new DummyReceiver, | ||
new DummyReceiver) | ||
val executors: List[String] = List("Host1", "Host2", "Host3", "Host4", "Host5") | ||
val locations = launcher.scheduleReceivers(receivers, executors) | ||
assert(locations(0)(0) === "Host2") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, Host2 is being used twice, while ideally Host2 should be used only once. There are enough hosts for 3 receivers that receiver3 does not need to be allocated to host2 again. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the algorithm needs to be tweaked a little bit more There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would require priority scheduling, mere tweaking will not do. Mixed receiver type in a single app is a corner case scenario, yet to be encountered in practice. Adding support for it (as we have done in this PR) is different from designing the algorithm around it in an attempt to make the corner case performant. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alright, we can address this later. This logic is good for now. |
||
assert(locations(1)(0) === "Host1") | ||
assert(locations(2)(0) === "Host2") | ||
assert(locations(1)(1) === "Host3") | ||
} | ||
} | ||
|
||
/** | ||
* Dummy receiver implementation | ||
*/ | ||
class DummyReceiver(host: Option[String] = None) extends Receiver[Int](StorageLevel.MEMORY_ONLY) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make this private class so that its not used outside this class. |
||
|
||
def onStart() { | ||
} | ||
|
||
def onStop() { | ||
} | ||
|
||
override def preferredLocation: Option[String] = host | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you really need this to be an
Array[ArrayBuffer]
? Couldn't this just beArray[Seq]
? You could just wrap the executor in aSeq
when adding it to the array. So you could dolocations(i) = Seq(executor(index))
instead of first allocating anArrayBuffer
first etc. Is there a case where a receiver could have more than 1 location specified? I don't really see one.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TD suggested adding support for the case when num_executors > num_receivers-- assign multiple executors per receiver for fault tolerance, which is not a bad idea (though probably not common in practice).