From bb3bbfd25a33dcad969c18df7dae0b62f01aefb4 Mon Sep 17 00:00:00 2001
From: Prashant Sharma
Date: Tue, 18 Nov 2014 19:41:55 +0530
Subject: [PATCH 1/3] SPARK-3962 Marked scope as provided for external.

---
 external/flume/pom.xml | 8 +-
 .../streaming/LocalJavaStreamingContext.java | 40 ++
 .../spark/streaming/TestSuiteBase.scala | 413 ++++++++++++++++++
 external/kafka/pom.xml | 8 +-
 .../spark/streaming/TestSuiteBase.scala | 413 ++++++++++++++++++
 external/mqtt/pom.xml | 8 +-
 .../streaming/LocalJavaStreamingContext.java | 40 ++
 .../spark/streaming/TestSuiteBase.scala | 413 ++++++++++++++++++
 external/twitter/pom.xml | 8 +-
 .../streaming/LocalJavaStreamingContext.java | 40 ++
 .../spark/streaming/TestSuiteBase.scala | 413 ++++++++++++++++++
 external/zeromq/pom.xml | 8 +-
 .../streaming/LocalJavaStreamingContext.java | 40 ++
 .../spark/streaming/TestSuiteBase.scala | 413 ++++++++++++++++++
 14 files changed, 2230 insertions(+), 35 deletions(-)
 create mode 100644 external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
 create mode 100644 external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
 create mode 100644 external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
 create mode 100644 external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
 create mode 100644 external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
 create mode 100644 external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
 create mode 100644 external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
 create mode 100644 external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
 create mode 100644 external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala

diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 7d31e32283d88..c60205dc4141c 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -39,19 +39,13 @@
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
+      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming-flume-sink_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>org.apache.flume</groupId>
       <artifactId>flume-ng-sdk</artifactId>
diff --git a/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
new file mode 100644
index 0000000000000..6e1f01900071b
--- /dev/null
+++ b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.spark.streaming; + +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.junit.After; +import org.junit.Before; + +public abstract class LocalJavaStreamingContext { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } +} diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala new file mode 100644 index 0000000000000..76b3b73a2ff3b --- /dev/null +++ b/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{ObjectInputStream, IOException} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SynchronizedBuffer +import scala.reflect.ClassTag + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.{SparkConf, Logging} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils + +/** + * This is a input stream just for the testsuites. This is equivalent to a checkpointable, + * replayable, reliable message queue like Kafka. It requires a sequence as input, and + * returns the i_th element at the i_th batch unde manual clock. + */ +class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) + extends InputDStream[T](ssc_) { + + def start() {} + + def stop() {} + + def compute(validTime: Time): Option[RDD[T]] = { + logInfo("Computing RDD for time " + validTime) + val index = ((validTime - zeroTime) / slideDuration - 1).toInt + val selectedInput = if (index < input.size) input(index) else Seq[T]() + + // lets us test cases where RDDs are not created + if (selectedInput == null) + return None + + val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) + logInfo("Created RDD " + rdd.id + " with " + selectedInput) + Some(rdd) + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. 
+ * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each + * containing a sequence of items. + */ +class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.glom().collect().map(_.toSeq) + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } + + def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) +} + +/** + * This is the base trait for Spark Streaming testsuites. This provides basic functionality + * to run user-defined set of input on user-defined stream operations, and verify the output. + */ +trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { + + // Name of the framework for Spark context + def framework = this.getClass.getSimpleName + + // Master for Spark context + def master = "local[2]" + + // Batch duration + def batchDuration = Seconds(1) + + // Directory where the checkpoint data will be saved + lazy val checkpointDir = { + val dir = Utils.createTempDir() + logDebug(s"checkpointDir: $dir") + dir.toString + } + + // Number of partitions of the input parallel collections created for testing + def numInputPartitions = 2 + + // Maximum time to wait before the test times out + def maxWaitTimeMillis = 10000 + + // Whether to use manual clock or not + def useManualClock = true + + // Whether to actually wait in real time before changing manual clock + def actuallyWait = false + + //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. + val conf = new SparkConf() + .setMaster(master) + .setAppName(framework) + + // Default before function for any streaming test suite. Override this + // if you want to add your stuff to "before" (i.e., don't call before { } ) + def beforeFunction() { + if (useManualClock) { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } else { + logInfo("Using real clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + } + } + + // Default after function for any streaming test suite. 
Override this + // if you want to add your stuff to "after" (i.e., don't call after { } ) + def afterFunction() { + System.clearProperty("spark.streaming.clock") + } + + before(beforeFunction) + after(afterFunction) + + /** + * Run a block of code with the given StreamingContext and automatically + * stop the context when the block completes or when an exception is thrown. + */ + def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { + try { + block(ssc) + } finally { + try { + ssc.stop(stopSparkContext = true) + } catch { + case e: Exception => + logError("Error stopping StreamingContext", e) + } + } + } + + /** + * Set up required DStreams to test the DStream operation using the two sequences + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + numPartitions: Int = numInputPartitions + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream = new TestInputStream(ssc, input, numPartitions) + val operatedStream = operation(inputStream) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) + outputStream.register() + ssc + } + + /** + * Set up required DStreams to test the binary operation using the sequence + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W] + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) + val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) + val operatedStream = operation(inputStream1, inputStream2) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) + outputStream.register() + ssc + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of items for each RDD. + */ + def runStreams[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[V]] = { + // Flatten each RDD into a single Seq + runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each + * representing one partition. 
+ */ + def runStreamsWithPartitions[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[Seq[V]]] = { + assert(numBatches > 0, "Number of batches to run stream computation is zero") + assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") + logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) + + // Get the output buffer + val outputStream = ssc.graph.getOutputStreams. + filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). + head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val output = outputStream.output + + try { + // Start computation + ssc.start() + + // Advance manual clock + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + logInfo("Manual clock before advancing = " + clock.time) + if (actuallyWait) { + for (i <- 1 to numBatches) { + logInfo("Actually waiting for " + batchDuration) + clock.addToTime(batchDuration.milliseconds) + Thread.sleep(batchDuration.milliseconds) + } + } else { + clock.addToTime(numBatches * batchDuration.milliseconds) + } + logInfo("Manual clock after advancing = " + clock.time) + + // Wait until expected number of output items have been generated + val startTime = System.currentTimeMillis() + while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { + logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) + ssc.awaitTermination(50) + } + val timeTaken = System.currentTimeMillis() - startTime + logInfo("Output generated in " + timeTaken + " milliseconds") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") + assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") + + Thread.sleep(100) // Give some time for the forgetting old RDDs to complete + } finally { + ssc.stop(stopSparkContext = true) + } + output + } + + /** + * Verify whether the output values after running a DStream operation + * is same as the expected output values, by comparing the output + * collections either as lists (order matters) or sets (order does not matter) + */ + def verifyOutput[V: ClassTag]( + output: Seq[Seq[V]], + expectedOutput: Seq[Seq[V]], + useSet: Boolean + ) { + logInfo("--------------------------------") + logInfo("output.size = " + output.size) + logInfo("output") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("expected output.size = " + expectedOutput.size) + logInfo("expected output") + expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("--------------------------------") + + // Match the output with the expected output + assert(output.size === expectedOutput.size, "Number of outputs do not match") + for (i <- 0 until output.size) { + if (useSet) { + assert(output(i).toSet === expectedOutput(i).toSet) + } else { + assert(output(i).toList === expectedOutput(i).toList) + } + } + logInfo("Output verified successfully") + } + + /** + * Test unary DStream operation with a list of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + useSet: Boolean = false + ) { + testOperation[U, V](input, operation, expectedOutput, -1, useSet) + } + + /** + * Test unary DStream operation with a list of inputs + * @param input Sequence of 
input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V](input, operation)) { ssc => + val output = runStreams[V](ssc, numBatches_, expectedOutput.size) + verifyOutput[V](output, expectedOutput, useSet) + } + } + + /** + * Test binary DStream operation with two lists of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + useSet: Boolean + ) { + testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) + } + + /** + * Test binary DStream operation with two lists of inputs + * @param input1 First sequence of input collections + * @param input2 Second sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => + val output = runStreams[W](ssc, numBatches_, expectedOutput.size) + verifyOutput[W](output, expectedOutput, useSet) + } + } +} diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 2067c473f0e3f..5761ba5e4a971 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -39,13 +39,7 @@ org.apache.spark spark-streaming_${scala.binary.version} ${project.version} - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${project.version} - test-jar - test + provided org.apache.kafka diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala new file mode 100644 index 0000000000000..76b3b73a2ff3b --- /dev/null +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{ObjectInputStream, IOException} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SynchronizedBuffer +import scala.reflect.ClassTag + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.{SparkConf, Logging} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils + +/** + * This is a input stream just for the testsuites. This is equivalent to a checkpointable, + * replayable, reliable message queue like Kafka. It requires a sequence as input, and + * returns the i_th element at the i_th batch unde manual clock. + */ +class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) + extends InputDStream[T](ssc_) { + + def start() {} + + def stop() {} + + def compute(validTime: Time): Option[RDD[T]] = { + logInfo("Computing RDD for time " + validTime) + val index = ((validTime - zeroTime) / slideDuration - 1).toInt + val selectedInput = if (index < input.size) input(index) else Seq[T]() + + // lets us test cases where RDDs are not created + if (selectedInput == null) + return None + + val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) + logInfo("Created RDD " + rdd.id + " with " + selectedInput) + Some(rdd) + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each + * containing a sequence of items. 
+ */ +class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.glom().collect().map(_.toSeq) + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } + + def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) +} + +/** + * This is the base trait for Spark Streaming testsuites. This provides basic functionality + * to run user-defined set of input on user-defined stream operations, and verify the output. + */ +trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { + + // Name of the framework for Spark context + def framework = this.getClass.getSimpleName + + // Master for Spark context + def master = "local[2]" + + // Batch duration + def batchDuration = Seconds(1) + + // Directory where the checkpoint data will be saved + lazy val checkpointDir = { + val dir = Utils.createTempDir() + logDebug(s"checkpointDir: $dir") + dir.toString + } + + // Number of partitions of the input parallel collections created for testing + def numInputPartitions = 2 + + // Maximum time to wait before the test times out + def maxWaitTimeMillis = 10000 + + // Whether to use manual clock or not + def useManualClock = true + + // Whether to actually wait in real time before changing manual clock + def actuallyWait = false + + //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. + val conf = new SparkConf() + .setMaster(master) + .setAppName(framework) + + // Default before function for any streaming test suite. Override this + // if you want to add your stuff to "before" (i.e., don't call before { } ) + def beforeFunction() { + if (useManualClock) { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } else { + logInfo("Using real clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + } + } + + // Default after function for any streaming test suite. Override this + // if you want to add your stuff to "after" (i.e., don't call after { } ) + def afterFunction() { + System.clearProperty("spark.streaming.clock") + } + + before(beforeFunction) + after(afterFunction) + + /** + * Run a block of code with the given StreamingContext and automatically + * stop the context when the block completes or when an exception is thrown. + */ + def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { + try { + block(ssc) + } finally { + try { + ssc.stop(stopSparkContext = true) + } catch { + case e: Exception => + logError("Error stopping StreamingContext", e) + } + } + } + + /** + * Set up required DStreams to test the DStream operation using the two sequences + * of input collections. 
+ */ + def setupStreams[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + numPartitions: Int = numInputPartitions + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream = new TestInputStream(ssc, input, numPartitions) + val operatedStream = operation(inputStream) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) + outputStream.register() + ssc + } + + /** + * Set up required DStreams to test the binary operation using the sequence + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W] + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) + val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) + val operatedStream = operation(inputStream1, inputStream2) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) + outputStream.register() + ssc + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of items for each RDD. + */ + def runStreams[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[V]] = { + // Flatten each RDD into a single Seq + runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each + * representing one partition. + */ + def runStreamsWithPartitions[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[Seq[V]]] = { + assert(numBatches > 0, "Number of batches to run stream computation is zero") + assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") + logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) + + // Get the output buffer + val outputStream = ssc.graph.getOutputStreams. + filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). 
+ head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val output = outputStream.output + + try { + // Start computation + ssc.start() + + // Advance manual clock + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + logInfo("Manual clock before advancing = " + clock.time) + if (actuallyWait) { + for (i <- 1 to numBatches) { + logInfo("Actually waiting for " + batchDuration) + clock.addToTime(batchDuration.milliseconds) + Thread.sleep(batchDuration.milliseconds) + } + } else { + clock.addToTime(numBatches * batchDuration.milliseconds) + } + logInfo("Manual clock after advancing = " + clock.time) + + // Wait until expected number of output items have been generated + val startTime = System.currentTimeMillis() + while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { + logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) + ssc.awaitTermination(50) + } + val timeTaken = System.currentTimeMillis() - startTime + logInfo("Output generated in " + timeTaken + " milliseconds") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") + assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") + + Thread.sleep(100) // Give some time for the forgetting old RDDs to complete + } finally { + ssc.stop(stopSparkContext = true) + } + output + } + + /** + * Verify whether the output values after running a DStream operation + * is same as the expected output values, by comparing the output + * collections either as lists (order matters) or sets (order does not matter) + */ + def verifyOutput[V: ClassTag]( + output: Seq[Seq[V]], + expectedOutput: Seq[Seq[V]], + useSet: Boolean + ) { + logInfo("--------------------------------") + logInfo("output.size = " + output.size) + logInfo("output") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("expected output.size = " + expectedOutput.size) + logInfo("expected output") + expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("--------------------------------") + + // Match the output with the expected output + assert(output.size === expectedOutput.size, "Number of outputs do not match") + for (i <- 0 until output.size) { + if (useSet) { + assert(output(i).toSet === expectedOutput(i).toSet) + } else { + assert(output(i).toList === expectedOutput(i).toList) + } + } + logInfo("Output verified successfully") + } + + /** + * Test unary DStream operation with a list of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + useSet: Boolean = false + ) { + testOperation[U, V](input, operation, expectedOutput, -1, useSet) + } + + /** + * Test unary DStream operation with a list of inputs + * @param input Sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + numBatches: Int, + useSet: Boolean 
+ ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V](input, operation)) { ssc => + val output = runStreams[V](ssc, numBatches_, expectedOutput.size) + verifyOutput[V](output, expectedOutput, useSet) + } + } + + /** + * Test binary DStream operation with two lists of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + useSet: Boolean + ) { + testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) + } + + /** + * Test binary DStream operation with two lists of inputs + * @param input1 First sequence of input collections + * @param input2 Second sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => + val output = runStreams[W](ssc, numBatches_, expectedOutput.size) + verifyOutput[W](output, expectedOutput, useSet) + } + } +} diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 362a76e515938..e1b816a43b0ec 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -39,13 +39,7 @@ org.apache.spark spark-streaming_${scala.binary.version} ${project.version} - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${project.version} - test-jar - test + provided org.eclipse.paho diff --git a/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java new file mode 100644 index 0000000000000..6e1f01900071b --- /dev/null +++ b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming; + +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.junit.After; +import org.junit.Before; + +public abstract class LocalJavaStreamingContext { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } +} diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala new file mode 100644 index 0000000000000..76b3b73a2ff3b --- /dev/null +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{ObjectInputStream, IOException} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SynchronizedBuffer +import scala.reflect.ClassTag + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.{SparkConf, Logging} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils + +/** + * This is a input stream just for the testsuites. This is equivalent to a checkpointable, + * replayable, reliable message queue like Kafka. It requires a sequence as input, and + * returns the i_th element at the i_th batch unde manual clock. + */ +class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) + extends InputDStream[T](ssc_) { + + def start() {} + + def stop() {} + + def compute(validTime: Time): Option[RDD[T]] = { + logInfo("Computing RDD for time " + validTime) + val index = ((validTime - zeroTime) / slideDuration - 1).toInt + val selectedInput = if (index < input.size) input(index) else Seq[T]() + + // lets us test cases where RDDs are not created + if (selectedInput == null) + return None + + val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) + logInfo("Created RDD " + rdd.id + " with " + selectedInput) + Some(rdd) + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. 
+ * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each + * containing a sequence of items. + */ +class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.glom().collect().map(_.toSeq) + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } + + def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) +} + +/** + * This is the base trait for Spark Streaming testsuites. This provides basic functionality + * to run user-defined set of input on user-defined stream operations, and verify the output. + */ +trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { + + // Name of the framework for Spark context + def framework = this.getClass.getSimpleName + + // Master for Spark context + def master = "local[2]" + + // Batch duration + def batchDuration = Seconds(1) + + // Directory where the checkpoint data will be saved + lazy val checkpointDir = { + val dir = Utils.createTempDir() + logDebug(s"checkpointDir: $dir") + dir.toString + } + + // Number of partitions of the input parallel collections created for testing + def numInputPartitions = 2 + + // Maximum time to wait before the test times out + def maxWaitTimeMillis = 10000 + + // Whether to use manual clock or not + def useManualClock = true + + // Whether to actually wait in real time before changing manual clock + def actuallyWait = false + + //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. + val conf = new SparkConf() + .setMaster(master) + .setAppName(framework) + + // Default before function for any streaming test suite. Override this + // if you want to add your stuff to "before" (i.e., don't call before { } ) + def beforeFunction() { + if (useManualClock) { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } else { + logInfo("Using real clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + } + } + + // Default after function for any streaming test suite. 
Override this + // if you want to add your stuff to "after" (i.e., don't call after { } ) + def afterFunction() { + System.clearProperty("spark.streaming.clock") + } + + before(beforeFunction) + after(afterFunction) + + /** + * Run a block of code with the given StreamingContext and automatically + * stop the context when the block completes or when an exception is thrown. + */ + def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { + try { + block(ssc) + } finally { + try { + ssc.stop(stopSparkContext = true) + } catch { + case e: Exception => + logError("Error stopping StreamingContext", e) + } + } + } + + /** + * Set up required DStreams to test the DStream operation using the two sequences + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + numPartitions: Int = numInputPartitions + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream = new TestInputStream(ssc, input, numPartitions) + val operatedStream = operation(inputStream) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) + outputStream.register() + ssc + } + + /** + * Set up required DStreams to test the binary operation using the sequence + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W] + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) + val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) + val operatedStream = operation(inputStream1, inputStream2) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) + outputStream.register() + ssc + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of items for each RDD. + */ + def runStreams[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[V]] = { + // Flatten each RDD into a single Seq + runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each + * representing one partition. 
+ */ + def runStreamsWithPartitions[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[Seq[V]]] = { + assert(numBatches > 0, "Number of batches to run stream computation is zero") + assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") + logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) + + // Get the output buffer + val outputStream = ssc.graph.getOutputStreams. + filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). + head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val output = outputStream.output + + try { + // Start computation + ssc.start() + + // Advance manual clock + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + logInfo("Manual clock before advancing = " + clock.time) + if (actuallyWait) { + for (i <- 1 to numBatches) { + logInfo("Actually waiting for " + batchDuration) + clock.addToTime(batchDuration.milliseconds) + Thread.sleep(batchDuration.milliseconds) + } + } else { + clock.addToTime(numBatches * batchDuration.milliseconds) + } + logInfo("Manual clock after advancing = " + clock.time) + + // Wait until expected number of output items have been generated + val startTime = System.currentTimeMillis() + while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { + logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) + ssc.awaitTermination(50) + } + val timeTaken = System.currentTimeMillis() - startTime + logInfo("Output generated in " + timeTaken + " milliseconds") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") + assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") + + Thread.sleep(100) // Give some time for the forgetting old RDDs to complete + } finally { + ssc.stop(stopSparkContext = true) + } + output + } + + /** + * Verify whether the output values after running a DStream operation + * is same as the expected output values, by comparing the output + * collections either as lists (order matters) or sets (order does not matter) + */ + def verifyOutput[V: ClassTag]( + output: Seq[Seq[V]], + expectedOutput: Seq[Seq[V]], + useSet: Boolean + ) { + logInfo("--------------------------------") + logInfo("output.size = " + output.size) + logInfo("output") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("expected output.size = " + expectedOutput.size) + logInfo("expected output") + expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("--------------------------------") + + // Match the output with the expected output + assert(output.size === expectedOutput.size, "Number of outputs do not match") + for (i <- 0 until output.size) { + if (useSet) { + assert(output(i).toSet === expectedOutput(i).toSet) + } else { + assert(output(i).toList === expectedOutput(i).toList) + } + } + logInfo("Output verified successfully") + } + + /** + * Test unary DStream operation with a list of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + useSet: Boolean = false + ) { + testOperation[U, V](input, operation, expectedOutput, -1, useSet) + } + + /** + * Test unary DStream operation with a list of inputs + * @param input Sequence of 
input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V](input, operation)) { ssc => + val output = runStreams[V](ssc, numBatches_, expectedOutput.size) + verifyOutput[V](output, expectedOutput, useSet) + } + } + + /** + * Test binary DStream operation with two lists of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + useSet: Boolean + ) { + testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) + } + + /** + * Test binary DStream operation with two lists of inputs + * @param input1 First sequence of input collections + * @param input2 Second sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => + val output = runStreams[W](ssc, numBatches_, expectedOutput.size) + verifyOutput[W](output, expectedOutput, useSet) + } + } +} diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 1d7dd49d15c22..81a53105af8b5 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -39,13 +39,7 @@ org.apache.spark spark-streaming_${scala.binary.version} ${project.version} - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${project.version} - test-jar - test + provided org.twitter4j diff --git a/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java new file mode 100644 index 0000000000000..6e1f01900071b --- /dev/null +++ b/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming; + +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.junit.After; +import org.junit.Before; + +public abstract class LocalJavaStreamingContext { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } +} diff --git a/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala new file mode 100644 index 0000000000000..76b3b73a2ff3b --- /dev/null +++ b/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{ObjectInputStream, IOException} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SynchronizedBuffer +import scala.reflect.ClassTag + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.{SparkConf, Logging} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils + +/** + * This is a input stream just for the testsuites. This is equivalent to a checkpointable, + * replayable, reliable message queue like Kafka. It requires a sequence as input, and + * returns the i_th element at the i_th batch unde manual clock. 
+ */ +class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) + extends InputDStream[T](ssc_) { + + def start() {} + + def stop() {} + + def compute(validTime: Time): Option[RDD[T]] = { + logInfo("Computing RDD for time " + validTime) + val index = ((validTime - zeroTime) / slideDuration - 1).toInt + val selectedInput = if (index < input.size) input(index) else Seq[T]() + + // lets us test cases where RDDs are not created + if (selectedInput == null) + return None + + val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) + logInfo("Created RDD " + rdd.id + " with " + selectedInput) + Some(rdd) + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each + * containing a sequence of items. + */ +class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.glom().collect().map(_.toSeq) + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } + + def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) +} + +/** + * This is the base trait for Spark Streaming testsuites. This provides basic functionality + * to run user-defined set of input on user-defined stream operations, and verify the output. + */ +trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { + + // Name of the framework for Spark context + def framework = this.getClass.getSimpleName + + // Master for Spark context + def master = "local[2]" + + // Batch duration + def batchDuration = Seconds(1) + + // Directory where the checkpoint data will be saved + lazy val checkpointDir = { + val dir = Utils.createTempDir() + logDebug(s"checkpointDir: $dir") + dir.toString + } + + // Number of partitions of the input parallel collections created for testing + def numInputPartitions = 2 + + // Maximum time to wait before the test times out + def maxWaitTimeMillis = 10000 + + // Whether to use manual clock or not + def useManualClock = true + + // Whether to actually wait in real time before changing manual clock + def actuallyWait = false + + //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. 
+ val conf = new SparkConf() + .setMaster(master) + .setAppName(framework) + + // Default before function for any streaming test suite. Override this + // if you want to add your stuff to "before" (i.e., don't call before { } ) + def beforeFunction() { + if (useManualClock) { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } else { + logInfo("Using real clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + } + } + + // Default after function for any streaming test suite. Override this + // if you want to add your stuff to "after" (i.e., don't call after { } ) + def afterFunction() { + System.clearProperty("spark.streaming.clock") + } + + before(beforeFunction) + after(afterFunction) + + /** + * Run a block of code with the given StreamingContext and automatically + * stop the context when the block completes or when an exception is thrown. + */ + def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { + try { + block(ssc) + } finally { + try { + ssc.stop(stopSparkContext = true) + } catch { + case e: Exception => + logError("Error stopping StreamingContext", e) + } + } + } + + /** + * Set up required DStreams to test the DStream operation using the two sequences + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + numPartitions: Int = numInputPartitions + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream = new TestInputStream(ssc, input, numPartitions) + val operatedStream = operation(inputStream) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) + outputStream.register() + ssc + } + + /** + * Set up required DStreams to test the binary operation using the sequence + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W] + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) + val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) + val operatedStream = operation(inputStream1, inputStream2) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) + outputStream.register() + ssc + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of items for each RDD. 
+ */ + def runStreams[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[V]] = { + // Flatten each RDD into a single Seq + runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each + * representing one partition. + */ + def runStreamsWithPartitions[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[Seq[V]]] = { + assert(numBatches > 0, "Number of batches to run stream computation is zero") + assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") + logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) + + // Get the output buffer + val outputStream = ssc.graph.getOutputStreams. + filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). + head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val output = outputStream.output + + try { + // Start computation + ssc.start() + + // Advance manual clock + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + logInfo("Manual clock before advancing = " + clock.time) + if (actuallyWait) { + for (i <- 1 to numBatches) { + logInfo("Actually waiting for " + batchDuration) + clock.addToTime(batchDuration.milliseconds) + Thread.sleep(batchDuration.milliseconds) + } + } else { + clock.addToTime(numBatches * batchDuration.milliseconds) + } + logInfo("Manual clock after advancing = " + clock.time) + + // Wait until expected number of output items have been generated + val startTime = System.currentTimeMillis() + while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { + logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) + ssc.awaitTermination(50) + } + val timeTaken = System.currentTimeMillis() - startTime + logInfo("Output generated in " + timeTaken + " milliseconds") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") + assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") + + Thread.sleep(100) // Give some time for the forgetting old RDDs to complete + } finally { + ssc.stop(stopSparkContext = true) + } + output + } + + /** + * Verify whether the output values after running a DStream operation + * is same as the expected output values, by comparing the output + * collections either as lists (order matters) or sets (order does not matter) + */ + def verifyOutput[V: ClassTag]( + output: Seq[Seq[V]], + expectedOutput: Seq[Seq[V]], + useSet: Boolean + ) { + logInfo("--------------------------------") + logInfo("output.size = " + output.size) + logInfo("output") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("expected output.size = " + expectedOutput.size) + logInfo("expected output") + expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("--------------------------------") + + // Match the output with the expected output + assert(output.size === expectedOutput.size, "Number of outputs do not match") + for (i <- 0 until output.size) { + if (useSet) { + 
assert(output(i).toSet === expectedOutput(i).toSet) + } else { + assert(output(i).toList === expectedOutput(i).toList) + } + } + logInfo("Output verified successfully") + } + + /** + * Test unary DStream operation with a list of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + useSet: Boolean = false + ) { + testOperation[U, V](input, operation, expectedOutput, -1, useSet) + } + + /** + * Test unary DStream operation with a list of inputs + * @param input Sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V](input, operation)) { ssc => + val output = runStreams[V](ssc, numBatches_, expectedOutput.size) + verifyOutput[V](output, expectedOutput, useSet) + } + } + + /** + * Test binary DStream operation with two lists of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + useSet: Boolean + ) { + testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) + } + + /** + * Test binary DStream operation with two lists of inputs + * @param input1 First sequence of input collections + * @param input2 Second sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => + val output = runStreams[W](ssc, numBatches_, expectedOutput.size) + verifyOutput[W](output, expectedOutput, useSet) + } + } +} diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 7e48968feb3bc..6d75179e9404d 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -39,13 +39,7 @@ org.apache.spark spark-streaming_${scala.binary.version} ${project.version} - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${project.version} - test-jar - test + provided ${akka.group} diff --git a/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java 
b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java new file mode 100644 index 0000000000000..6e1f01900071b --- /dev/null +++ b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming; + +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.junit.After; +import org.junit.Before; + +public abstract class LocalJavaStreamingContext { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } +} diff --git a/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala new file mode 100644 index 0000000000000..76b3b73a2ff3b --- /dev/null +++ b/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{ObjectInputStream, IOException} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SynchronizedBuffer +import scala.reflect.ClassTag + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.{SparkConf, Logging} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils + +/** + * This is a input stream just for the testsuites. This is equivalent to a checkpointable, + * replayable, reliable message queue like Kafka. 
It requires a sequence as input, and + * returns the i_th element at the i_th batch unde manual clock. + */ +class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) + extends InputDStream[T](ssc_) { + + def start() {} + + def stop() {} + + def compute(validTime: Time): Option[RDD[T]] = { + logInfo("Computing RDD for time " + validTime) + val index = ((validTime - zeroTime) / slideDuration - 1).toInt + val selectedInput = if (index < input.size) input(index) else Seq[T]() + + // lets us test cases where RDDs are not created + if (selectedInput == null) + return None + + val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) + logInfo("Created RDD " + rdd.id + " with " + selectedInput) + Some(rdd) + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. + * + * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each + * containing a sequence of items. + */ +class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.glom().collect().map(_.toSeq) + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } + + def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) +} + +/** + * This is the base trait for Spark Streaming testsuites. This provides basic functionality + * to run user-defined set of input on user-defined stream operations, and verify the output. + */ +trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { + + // Name of the framework for Spark context + def framework = this.getClass.getSimpleName + + // Master for Spark context + def master = "local[2]" + + // Batch duration + def batchDuration = Seconds(1) + + // Directory where the checkpoint data will be saved + lazy val checkpointDir = { + val dir = Utils.createTempDir() + logDebug(s"checkpointDir: $dir") + dir.toString + } + + // Number of partitions of the input parallel collections created for testing + def numInputPartitions = 2 + + // Maximum time to wait before the test times out + def maxWaitTimeMillis = 10000 + + // Whether to use manual clock or not + def useManualClock = true + + // Whether to actually wait in real time before changing manual clock + def actuallyWait = false + + //// A SparkConf to use in tests. 
Can be modified before calling setupStreams to configure things. + val conf = new SparkConf() + .setMaster(master) + .setAppName(framework) + + // Default before function for any streaming test suite. Override this + // if you want to add your stuff to "before" (i.e., don't call before { } ) + def beforeFunction() { + if (useManualClock) { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } else { + logInfo("Using real clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + } + } + + // Default after function for any streaming test suite. Override this + // if you want to add your stuff to "after" (i.e., don't call after { } ) + def afterFunction() { + System.clearProperty("spark.streaming.clock") + } + + before(beforeFunction) + after(afterFunction) + + /** + * Run a block of code with the given StreamingContext and automatically + * stop the context when the block completes or when an exception is thrown. + */ + def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { + try { + block(ssc) + } finally { + try { + ssc.stop(stopSparkContext = true) + } catch { + case e: Exception => + logError("Error stopping StreamingContext", e) + } + } + } + + /** + * Set up required DStreams to test the DStream operation using the two sequences + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + numPartitions: Int = numInputPartitions + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream = new TestInputStream(ssc, input, numPartitions) + val operatedStream = operation(inputStream) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) + outputStream.register() + ssc + } + + /** + * Set up required DStreams to test the binary operation using the sequence + * of input collections. + */ + def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W] + ): StreamingContext = { + // Create StreamingContext + val ssc = new StreamingContext(conf, batchDuration) + if (checkpointDir != null) { + ssc.checkpoint(checkpointDir) + } + + // Setup the stream computation + val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) + val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) + val operatedStream = operation(inputStream1, inputStream2) + val outputStream = new TestOutputStreamWithPartitions(operatedStream, + new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) + outputStream.register() + ssc + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of items for each RDD. 
+ */ + def runStreams[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[V]] = { + // Flatten each RDD into a single Seq + runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) + } + + /** + * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and + * returns the collected output. It will wait until `numExpectedOutput` number of + * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. + * + * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each + * representing one partition. + */ + def runStreamsWithPartitions[V: ClassTag]( + ssc: StreamingContext, + numBatches: Int, + numExpectedOutput: Int + ): Seq[Seq[Seq[V]]] = { + assert(numBatches > 0, "Number of batches to run stream computation is zero") + assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") + logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) + + // Get the output buffer + val outputStream = ssc.graph.getOutputStreams. + filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). + head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val output = outputStream.output + + try { + // Start computation + ssc.start() + + // Advance manual clock + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + logInfo("Manual clock before advancing = " + clock.time) + if (actuallyWait) { + for (i <- 1 to numBatches) { + logInfo("Actually waiting for " + batchDuration) + clock.addToTime(batchDuration.milliseconds) + Thread.sleep(batchDuration.milliseconds) + } + } else { + clock.addToTime(numBatches * batchDuration.milliseconds) + } + logInfo("Manual clock after advancing = " + clock.time) + + // Wait until expected number of output items have been generated + val startTime = System.currentTimeMillis() + while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { + logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) + ssc.awaitTermination(50) + } + val timeTaken = System.currentTimeMillis() - startTime + logInfo("Output generated in " + timeTaken + " milliseconds") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") + assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") + + Thread.sleep(100) // Give some time for the forgetting old RDDs to complete + } finally { + ssc.stop(stopSparkContext = true) + } + output + } + + /** + * Verify whether the output values after running a DStream operation + * is same as the expected output values, by comparing the output + * collections either as lists (order matters) or sets (order does not matter) + */ + def verifyOutput[V: ClassTag]( + output: Seq[Seq[V]], + expectedOutput: Seq[Seq[V]], + useSet: Boolean + ) { + logInfo("--------------------------------") + logInfo("output.size = " + output.size) + logInfo("output") + output.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("expected output.size = " + expectedOutput.size) + logInfo("expected output") + expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) + logInfo("--------------------------------") + + // Match the output with the expected output + assert(output.size === expectedOutput.size, "Number of outputs do not match") + for (i <- 0 until output.size) { + if (useSet) { + 
assert(output(i).toSet === expectedOutput(i).toSet) + } else { + assert(output(i).toList === expectedOutput(i).toList) + } + } + logInfo("Output verified successfully") + } + + /** + * Test unary DStream operation with a list of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + useSet: Boolean = false + ) { + testOperation[U, V](input, operation, expectedOutput, -1, useSet) + } + + /** + * Test unary DStream operation with a list of inputs + * @param input Sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag]( + input: Seq[Seq[U]], + operation: DStream[U] => DStream[V], + expectedOutput: Seq[Seq[V]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V](input, operation)) { ssc => + val output = runStreams[V](ssc, numBatches_, expectedOutput.size) + verifyOutput[V](output, expectedOutput, useSet) + } + } + + /** + * Test binary DStream operation with two lists of inputs, with number of + * batches to run same as the number of expected output values + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + useSet: Boolean + ) { + testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) + } + + /** + * Test binary DStream operation with two lists of inputs + * @param input1 First sequence of input collections + * @param input2 Second sequence of input collections + * @param operation Binary DStream operation to be applied to the 2 inputs + * @param expectedOutput Sequence of expected output collections + * @param numBatches Number of batches to run the operation for + * @param useSet Compare the output values with the expected output values + * as sets (order matters) or as lists (order does not matter) + */ + def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( + input1: Seq[Seq[U]], + input2: Seq[Seq[V]], + operation: (DStream[U], DStream[V]) => DStream[W], + expectedOutput: Seq[Seq[W]], + numBatches: Int, + useSet: Boolean + ) { + val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size + withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => + val output = runStreams[W](ssc, numBatches_, expectedOutput.size) + verifyOutput[W](output, expectedOutput, useSet) + } + } +} From 270b4fbcc4ba07860fc08f3432b248693a097171 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Wed, 19 Nov 2014 14:21:00 +0530 Subject: [PATCH 2/3] Removed most of the unused code. 
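
With spark-streaming marked as `provided`, the external modules can no longer rely on the
spark-streaming test-jar, so the per-module copies of TestSuiteBase.scala are deleted again and
only a small TestOutputStream helper is kept (under external/flume). Each suite now builds its
own SparkConf and StreamingContext and collects batch output through that helper. A minimal
sketch of the pattern follows; the suite name and the queue-backed input stream are illustrative
only and not part of this patch:

    package org.apache.spark.streaming.flume

    import scala.collection.mutable.{ArrayBuffer, Queue, SynchronizedBuffer}

    import org.scalatest.FunSuite

    import org.apache.spark.SparkConf
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.{Seconds, StreamingContext, TestOutputStream}

    // Illustrative only: how a suite can collect output with TestOutputStream
    // instead of extending the removed TestSuiteBase copy.
    class ExampleStreamSuite extends FunSuite {

      test("collect DStream output with TestOutputStream") {
        val conf = new SparkConf().setMaster("local[2]").setAppName("ExampleStreamSuite")
        val ssc = new StreamingContext(conf, Seconds(1))
        try {
          // Any DStream would do; a queue-backed stream keeps the example self-contained.
          val queue = new Queue[RDD[Int]]()
          queue += ssc.sparkContext.makeRDD(1 to 10, 2)
          val stream = ssc.queueStream(queue)

          // Collect each batch into a thread-safe buffer, one Seq per RDD.
          val outputBuffer = new ArrayBuffer[Seq[Int]] with SynchronizedBuffer[Seq[Int]]
          new TestOutputStream(stream, outputBuffer).register()

          ssc.start()
          // Poll until the expected items arrive or a 10 second budget is spent.
          val startTime = System.currentTimeMillis()
          while (outputBuffer.flatten.size < 10 &&
              System.currentTimeMillis() - startTime < 10000) {
            Thread.sleep(100)
          }
          assert(outputBuffer.flatten.toSet === (1 to 10).toSet)
        } finally {
          ssc.stop(stopSparkContext = true)
        }
      }
    }

TestOutputStream still clears its buffer in readObject, so suites that restore from a checkpoint
continue to start from an empty buffer without needing the rest of TestSuiteBase.
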
--- .../spark/streaming/TestOutputStream.scala | 48 ++ .../spark/streaming/TestSuiteBase.scala | 413 ------------------ .../flume/FlumePollingStreamSuite.scala | 15 +- .../spark/streaming/TestSuiteBase.scala | 413 ------------------ .../spark/streaming/TestSuiteBase.scala | 413 ------------------ .../streaming/mqtt/MQTTStreamSuite.scala | 12 +- .../spark/streaming/TestSuiteBase.scala | 413 ------------------ .../twitter/TwitterStreamSuite.scala | 18 +- .../spark/streaming/TestSuiteBase.scala | 413 ------------------ .../streaming/zeromq/ZeroMQStreamSuite.scala | 11 +- 10 files changed, 91 insertions(+), 2078 deletions(-) create mode 100644 external/flume/src/test/scala/org/apache/spark/streaming/TestOutputStream.scala delete mode 100644 external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala delete mode 100644 external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala delete mode 100644 external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala delete mode 100644 external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala delete mode 100644 external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/TestOutputStream.scala b/external/flume/src/test/scala/org/apache/spark/streaming/TestOutputStream.scala new file mode 100644 index 0000000000000..1a900007b696b --- /dev/null +++ b/external/flume/src/test/scala/org/apache/spark/streaming/TestOutputStream.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import java.io.{IOException, ObjectInputStream} + +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.dstream.{DStream, ForEachDStream} +import org.apache.spark.util.Utils + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +/** + * This is a output stream just for the testsuites. All the output is collected into a + * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. 
+ * + * The buffer contains a sequence of RDD's, each containing a sequence of items + */ +class TestOutputStream[T: ClassTag](parent: DStream[T], + val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) + extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { + val collected = rdd.collect() + output += collected + }) { + + // This is to clear the output buffer every it is read from a checkpoint + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { + ois.defaultReadObject() + output.clear() + } +} diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala deleted file mode 100644 index 76b3b73a2ff3b..0000000000000 --- a/external/flume/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming - -import java.io.{ObjectInputStream, IOException} - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.SynchronizedBuffer -import scala.reflect.ClassTag - -import org.scalatest.{BeforeAndAfter, FunSuite} - -import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.{SparkConf, Logging} -import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils - -/** - * This is a input stream just for the testsuites. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a sequence as input, and - * returns the i_th element at the i_th batch unde manual clock. - */ -class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) - extends InputDStream[T](ssc_) { - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[T]] = { - logInfo("Computing RDD for time " + validTime) - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedInput = if (index < input.size) input(index) else Seq[T]() - - // lets us test cases where RDDs are not created - if (selectedInput == null) - return None - - val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) - logInfo("Created RDD " + rdd.id + " with " + selectedInput) - Some(rdd) - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. 
- * - * The buffer contains a sequence of RDD's, each containing a sequence of items - */ -class TestOutputStream[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.collect() - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each - * containing a sequence of items. - */ -class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.glom().collect().map(_.toSeq) - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } - - def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) -} - -/** - * This is the base trait for Spark Streaming testsuites. This provides basic functionality - * to run user-defined set of input on user-defined stream operations, and verify the output. - */ -trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { - - // Name of the framework for Spark context - def framework = this.getClass.getSimpleName - - // Master for Spark context - def master = "local[2]" - - // Batch duration - def batchDuration = Seconds(1) - - // Directory where the checkpoint data will be saved - lazy val checkpointDir = { - val dir = Utils.createTempDir() - logDebug(s"checkpointDir: $dir") - dir.toString - } - - // Number of partitions of the input parallel collections created for testing - def numInputPartitions = 2 - - // Maximum time to wait before the test times out - def maxWaitTimeMillis = 10000 - - // Whether to use manual clock or not - def useManualClock = true - - // Whether to actually wait in real time before changing manual clock - def actuallyWait = false - - //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. - val conf = new SparkConf() - .setMaster(master) - .setAppName(framework) - - // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { - if (useManualClock) { - logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") - } else { - logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - } - } - - // Default after function for any streaming test suite. 
Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { - System.clearProperty("spark.streaming.clock") - } - - before(beforeFunction) - after(afterFunction) - - /** - * Run a block of code with the given StreamingContext and automatically - * stop the context when the block completes or when an exception is thrown. - */ - def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { - try { - block(ssc) - } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } - } - } - - /** - * Set up required DStreams to test the DStream operation using the two sequences - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - numPartitions: Int = numInputPartitions - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream = new TestInputStream(ssc, input, numPartitions) - val operatedStream = operation(inputStream) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) - outputStream.register() - ssc - } - - /** - * Set up required DStreams to test the binary operation using the sequence - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W] - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) - val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) - val operatedStream = operation(inputStream1, inputStream2) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) - outputStream.register() - ssc - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of items for each RDD. - */ - def runStreams[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[V]] = { - // Flatten each RDD into a single Seq - runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each - * representing one partition. 
- */ - def runStreamsWithPartitions[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[Seq[V]]] = { - assert(numBatches > 0, "Number of batches to run stream computation is zero") - assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") - logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) - - // Get the output buffer - val outputStream = ssc.graph.getOutputStreams. - filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). - head.asInstanceOf[TestOutputStreamWithPartitions[V]] - val output = outputStream.output - - try { - // Start computation - ssc.start() - - // Advance manual clock - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) - if (actuallyWait) { - for (i <- 1 to numBatches) { - logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) - Thread.sleep(batchDuration.milliseconds) - } - } else { - clock.addToTime(numBatches * batchDuration.milliseconds) - } - logInfo("Manual clock after advancing = " + clock.time) - - // Wait until expected number of output items have been generated - val startTime = System.currentTimeMillis() - while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { - logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) - } - val timeTaken = System.currentTimeMillis() - startTime - logInfo("Output generated in " + timeTaken + " milliseconds") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") - assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") - - Thread.sleep(100) // Give some time for the forgetting old RDDs to complete - } finally { - ssc.stop(stopSparkContext = true) - } - output - } - - /** - * Verify whether the output values after running a DStream operation - * is same as the expected output values, by comparing the output - * collections either as lists (order matters) or sets (order does not matter) - */ - def verifyOutput[V: ClassTag]( - output: Seq[Seq[V]], - expectedOutput: Seq[Seq[V]], - useSet: Boolean - ) { - logInfo("--------------------------------") - logInfo("output.size = " + output.size) - logInfo("output") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output.size = " + expectedOutput.size) - logInfo("expected output") - expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("--------------------------------") - - // Match the output with the expected output - assert(output.size === expectedOutput.size, "Number of outputs do not match") - for (i <- 0 until output.size) { - if (useSet) { - assert(output(i).toSet === expectedOutput(i).toSet) - } else { - assert(output(i).toList === expectedOutput(i).toList) - } - } - logInfo("Output verified successfully") - } - - /** - * Test unary DStream operation with a list of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - useSet: Boolean = false - ) { - testOperation[U, V](input, operation, expectedOutput, -1, useSet) - } - - /** - * Test unary DStream operation with a list of inputs - * @param input Sequence of 
input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V](input, operation)) { ssc => - val output = runStreams[V](ssc, numBatches_, expectedOutput.size) - verifyOutput[V](output, expectedOutput, useSet) - } - } - - /** - * Test binary DStream operation with two lists of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - useSet: Boolean - ) { - testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) - } - - /** - * Test binary DStream operation with two lists of inputs - * @param input1 First sequence of input collections - * @param input2 Second sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => - val output = runStreams[W](ssc, numBatches_, expectedOutput.size) - verifyOutput[W](output, expectedOutput, useSet) - } - } -} diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index 475026e8eb140..ba079d84e4529 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -20,9 +20,6 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress import java.util.concurrent.{Callable, ExecutorCompletionService, Executors} -import java.util.Random - -import org.apache.spark.TestUtils import scala.collection.JavaConversions._ import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} @@ -32,20 +29,28 @@ import org.apache.flume.channel.MemoryChannel import org.apache.flume.conf.Configurables import org.apache.flume.event.EventBuilder +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.util.ManualClock -import 
org.apache.spark.streaming.{TestSuiteBase, TestOutputStream, StreamingContext} +import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext} import org.apache.spark.streaming.flume.sink._ import org.apache.spark.util.Utils -class FlumePollingStreamSuite extends TestSuiteBase { +class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging { val batchCount = 5 val eventsPerBatch = 100 val totalEventsPerChannel = batchCount * eventsPerBatch val channelCapacity = 5000 val maxAttempts = 5 + val batchDuration = Seconds(1) + + val conf = new SparkConf() + .setMaster("local[2]") + .setAppName(this.getClass.getSimpleName) test("flume polling test") { testMultipleTimes(testFlumePolling) diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala deleted file mode 100644 index 76b3b73a2ff3b..0000000000000 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming - -import java.io.{ObjectInputStream, IOException} - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.SynchronizedBuffer -import scala.reflect.ClassTag - -import org.scalatest.{BeforeAndAfter, FunSuite} - -import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.{SparkConf, Logging} -import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils - -/** - * This is a input stream just for the testsuites. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a sequence as input, and - * returns the i_th element at the i_th batch unde manual clock. - */ -class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) - extends InputDStream[T](ssc_) { - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[T]] = { - logInfo("Computing RDD for time " + validTime) - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedInput = if (index < input.size) input(index) else Seq[T]() - - // lets us test cases where RDDs are not created - if (selectedInput == null) - return None - - val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) - logInfo("Created RDD " + rdd.id + " with " + selectedInput) - Some(rdd) - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. 
- * - * The buffer contains a sequence of RDD's, each containing a sequence of items - */ -class TestOutputStream[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.collect() - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each - * containing a sequence of items. - */ -class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.glom().collect().map(_.toSeq) - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } - - def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) -} - -/** - * This is the base trait for Spark Streaming testsuites. This provides basic functionality - * to run user-defined set of input on user-defined stream operations, and verify the output. - */ -trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { - - // Name of the framework for Spark context - def framework = this.getClass.getSimpleName - - // Master for Spark context - def master = "local[2]" - - // Batch duration - def batchDuration = Seconds(1) - - // Directory where the checkpoint data will be saved - lazy val checkpointDir = { - val dir = Utils.createTempDir() - logDebug(s"checkpointDir: $dir") - dir.toString - } - - // Number of partitions of the input parallel collections created for testing - def numInputPartitions = 2 - - // Maximum time to wait before the test times out - def maxWaitTimeMillis = 10000 - - // Whether to use manual clock or not - def useManualClock = true - - // Whether to actually wait in real time before changing manual clock - def actuallyWait = false - - //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. - val conf = new SparkConf() - .setMaster(master) - .setAppName(framework) - - // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { - if (useManualClock) { - logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") - } else { - logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - } - } - - // Default after function for any streaming test suite. 
Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { - System.clearProperty("spark.streaming.clock") - } - - before(beforeFunction) - after(afterFunction) - - /** - * Run a block of code with the given StreamingContext and automatically - * stop the context when the block completes or when an exception is thrown. - */ - def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { - try { - block(ssc) - } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } - } - } - - /** - * Set up required DStreams to test the DStream operation using the two sequences - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - numPartitions: Int = numInputPartitions - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream = new TestInputStream(ssc, input, numPartitions) - val operatedStream = operation(inputStream) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) - outputStream.register() - ssc - } - - /** - * Set up required DStreams to test the binary operation using the sequence - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W] - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) - val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) - val operatedStream = operation(inputStream1, inputStream2) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) - outputStream.register() - ssc - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of items for each RDD. - */ - def runStreams[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[V]] = { - // Flatten each RDD into a single Seq - runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each - * representing one partition. 
- */ - def runStreamsWithPartitions[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[Seq[V]]] = { - assert(numBatches > 0, "Number of batches to run stream computation is zero") - assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") - logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) - - // Get the output buffer - val outputStream = ssc.graph.getOutputStreams. - filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). - head.asInstanceOf[TestOutputStreamWithPartitions[V]] - val output = outputStream.output - - try { - // Start computation - ssc.start() - - // Advance manual clock - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) - if (actuallyWait) { - for (i <- 1 to numBatches) { - logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) - Thread.sleep(batchDuration.milliseconds) - } - } else { - clock.addToTime(numBatches * batchDuration.milliseconds) - } - logInfo("Manual clock after advancing = " + clock.time) - - // Wait until expected number of output items have been generated - val startTime = System.currentTimeMillis() - while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { - logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) - } - val timeTaken = System.currentTimeMillis() - startTime - logInfo("Output generated in " + timeTaken + " milliseconds") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") - assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") - - Thread.sleep(100) // Give some time for the forgetting old RDDs to complete - } finally { - ssc.stop(stopSparkContext = true) - } - output - } - - /** - * Verify whether the output values after running a DStream operation - * is same as the expected output values, by comparing the output - * collections either as lists (order matters) or sets (order does not matter) - */ - def verifyOutput[V: ClassTag]( - output: Seq[Seq[V]], - expectedOutput: Seq[Seq[V]], - useSet: Boolean - ) { - logInfo("--------------------------------") - logInfo("output.size = " + output.size) - logInfo("output") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output.size = " + expectedOutput.size) - logInfo("expected output") - expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("--------------------------------") - - // Match the output with the expected output - assert(output.size === expectedOutput.size, "Number of outputs do not match") - for (i <- 0 until output.size) { - if (useSet) { - assert(output(i).toSet === expectedOutput(i).toSet) - } else { - assert(output(i).toList === expectedOutput(i).toList) - } - } - logInfo("Output verified successfully") - } - - /** - * Test unary DStream operation with a list of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - useSet: Boolean = false - ) { - testOperation[U, V](input, operation, expectedOutput, -1, useSet) - } - - /** - * Test unary DStream operation with a list of inputs - * @param input Sequence of 
input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V](input, operation)) { ssc => - val output = runStreams[V](ssc, numBatches_, expectedOutput.size) - verifyOutput[V](output, expectedOutput, useSet) - } - } - - /** - * Test binary DStream operation with two lists of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - useSet: Boolean - ) { - testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) - } - - /** - * Test binary DStream operation with two lists of inputs - * @param input1 First sequence of input collections - * @param input2 Second sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => - val output = runStreams[W](ssc, numBatches_, expectedOutput.size) - verifyOutput[W](output, expectedOutput, useSet) - } - } -} diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala deleted file mode 100644 index 76b3b73a2ff3b..0000000000000 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.streaming - -import java.io.{ObjectInputStream, IOException} - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.SynchronizedBuffer -import scala.reflect.ClassTag - -import org.scalatest.{BeforeAndAfter, FunSuite} - -import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.{SparkConf, Logging} -import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils - -/** - * This is a input stream just for the testsuites. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a sequence as input, and - * returns the i_th element at the i_th batch unde manual clock. - */ -class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) - extends InputDStream[T](ssc_) { - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[T]] = { - logInfo("Computing RDD for time " + validTime) - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedInput = if (index < input.size) input(index) else Seq[T]() - - // lets us test cases where RDDs are not created - if (selectedInput == null) - return None - - val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) - logInfo("Created RDD " + rdd.id + " with " + selectedInput) - Some(rdd) - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of items - */ -class TestOutputStream[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.collect() - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each - * containing a sequence of items. - */ -class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.glom().collect().map(_.toSeq) - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } - - def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) -} - -/** - * This is the base trait for Spark Streaming testsuites. This provides basic functionality - * to run user-defined set of input on user-defined stream operations, and verify the output. 
- */ -trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { - - // Name of the framework for Spark context - def framework = this.getClass.getSimpleName - - // Master for Spark context - def master = "local[2]" - - // Batch duration - def batchDuration = Seconds(1) - - // Directory where the checkpoint data will be saved - lazy val checkpointDir = { - val dir = Utils.createTempDir() - logDebug(s"checkpointDir: $dir") - dir.toString - } - - // Number of partitions of the input parallel collections created for testing - def numInputPartitions = 2 - - // Maximum time to wait before the test times out - def maxWaitTimeMillis = 10000 - - // Whether to use manual clock or not - def useManualClock = true - - // Whether to actually wait in real time before changing manual clock - def actuallyWait = false - - //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. - val conf = new SparkConf() - .setMaster(master) - .setAppName(framework) - - // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { - if (useManualClock) { - logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") - } else { - logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - } - } - - // Default after function for any streaming test suite. Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { - System.clearProperty("spark.streaming.clock") - } - - before(beforeFunction) - after(afterFunction) - - /** - * Run a block of code with the given StreamingContext and automatically - * stop the context when the block completes or when an exception is thrown. - */ - def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { - try { - block(ssc) - } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } - } - } - - /** - * Set up required DStreams to test the DStream operation using the two sequences - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - numPartitions: Int = numInputPartitions - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream = new TestInputStream(ssc, input, numPartitions) - val operatedStream = operation(inputStream) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) - outputStream.register() - ssc - } - - /** - * Set up required DStreams to test the binary operation using the sequence - * of input collections. 
- */ - def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W] - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) - val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) - val operatedStream = operation(inputStream1, inputStream2) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) - outputStream.register() - ssc - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of items for each RDD. - */ - def runStreams[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[V]] = { - // Flatten each RDD into a single Seq - runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each - * representing one partition. - */ - def runStreamsWithPartitions[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[Seq[V]]] = { - assert(numBatches > 0, "Number of batches to run stream computation is zero") - assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") - logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) - - // Get the output buffer - val outputStream = ssc.graph.getOutputStreams. - filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). 
- head.asInstanceOf[TestOutputStreamWithPartitions[V]] - val output = outputStream.output - - try { - // Start computation - ssc.start() - - // Advance manual clock - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) - if (actuallyWait) { - for (i <- 1 to numBatches) { - logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) - Thread.sleep(batchDuration.milliseconds) - } - } else { - clock.addToTime(numBatches * batchDuration.milliseconds) - } - logInfo("Manual clock after advancing = " + clock.time) - - // Wait until expected number of output items have been generated - val startTime = System.currentTimeMillis() - while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { - logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) - } - val timeTaken = System.currentTimeMillis() - startTime - logInfo("Output generated in " + timeTaken + " milliseconds") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") - assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") - - Thread.sleep(100) // Give some time for the forgetting old RDDs to complete - } finally { - ssc.stop(stopSparkContext = true) - } - output - } - - /** - * Verify whether the output values after running a DStream operation - * is same as the expected output values, by comparing the output - * collections either as lists (order matters) or sets (order does not matter) - */ - def verifyOutput[V: ClassTag]( - output: Seq[Seq[V]], - expectedOutput: Seq[Seq[V]], - useSet: Boolean - ) { - logInfo("--------------------------------") - logInfo("output.size = " + output.size) - logInfo("output") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output.size = " + expectedOutput.size) - logInfo("expected output") - expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("--------------------------------") - - // Match the output with the expected output - assert(output.size === expectedOutput.size, "Number of outputs do not match") - for (i <- 0 until output.size) { - if (useSet) { - assert(output(i).toSet === expectedOutput(i).toSet) - } else { - assert(output(i).toList === expectedOutput(i).toList) - } - } - logInfo("Output verified successfully") - } - - /** - * Test unary DStream operation with a list of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - useSet: Boolean = false - ) { - testOperation[U, V](input, operation, expectedOutput, -1, useSet) - } - - /** - * Test unary DStream operation with a list of inputs - * @param input Sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - numBatches: Int, - useSet: Boolean 
- ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V](input, operation)) { ssc => - val output = runStreams[V](ssc, numBatches_, expectedOutput.size) - verifyOutput[V](output, expectedOutput, useSet) - } - } - - /** - * Test binary DStream operation with two lists of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - useSet: Boolean - ) { - testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) - } - - /** - * Test binary DStream operation with two lists of inputs - * @param input1 First sequence of input collections - * @param input2 Second sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => - val output = runStreams[W](ssc, numBatches_, expectedOutput.size) - verifyOutput[W](output, expectedOutput, useSet) - } - } -} diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 467fd263e2d64..84595acf45ccb 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -17,11 +17,19 @@ package org.apache.spark.streaming.mqtt -import org.apache.spark.streaming.{StreamingContext, TestSuiteBase} +import org.scalatest.FunSuite + +import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream -class MQTTStreamSuite extends TestSuiteBase { +class MQTTStreamSuite extends FunSuite { + + val batchDuration = Seconds(1) + + private val master: String = "local[2]" + + private val framework: String = this.getClass.getSimpleName test("mqtt input stream") { val ssc = new StreamingContext(master, framework, batchDuration) diff --git a/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala deleted file mode 100644 index 76b3b73a2ff3b..0000000000000 --- a/external/twitter/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming - -import java.io.{ObjectInputStream, IOException} - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.SynchronizedBuffer -import scala.reflect.ClassTag - -import org.scalatest.{BeforeAndAfter, FunSuite} - -import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.{SparkConf, Logging} -import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils - -/** - * This is a input stream just for the testsuites. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a sequence as input, and - * returns the i_th element at the i_th batch unde manual clock. - */ -class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) - extends InputDStream[T](ssc_) { - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[T]] = { - logInfo("Computing RDD for time " + validTime) - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedInput = if (index < input.size) input(index) else Seq[T]() - - // lets us test cases where RDDs are not created - if (selectedInput == null) - return None - - val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) - logInfo("Created RDD " + rdd.id + " with " + selectedInput) - Some(rdd) - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of items - */ -class TestOutputStream[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.collect() - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each - * containing a sequence of items. 
- */ -class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.glom().collect().map(_.toSeq) - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } - - def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) -} - -/** - * This is the base trait for Spark Streaming testsuites. This provides basic functionality - * to run user-defined set of input on user-defined stream operations, and verify the output. - */ -trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { - - // Name of the framework for Spark context - def framework = this.getClass.getSimpleName - - // Master for Spark context - def master = "local[2]" - - // Batch duration - def batchDuration = Seconds(1) - - // Directory where the checkpoint data will be saved - lazy val checkpointDir = { - val dir = Utils.createTempDir() - logDebug(s"checkpointDir: $dir") - dir.toString - } - - // Number of partitions of the input parallel collections created for testing - def numInputPartitions = 2 - - // Maximum time to wait before the test times out - def maxWaitTimeMillis = 10000 - - // Whether to use manual clock or not - def useManualClock = true - - // Whether to actually wait in real time before changing manual clock - def actuallyWait = false - - //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. - val conf = new SparkConf() - .setMaster(master) - .setAppName(framework) - - // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { - if (useManualClock) { - logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") - } else { - logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - } - } - - // Default after function for any streaming test suite. Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { - System.clearProperty("spark.streaming.clock") - } - - before(beforeFunction) - after(afterFunction) - - /** - * Run a block of code with the given StreamingContext and automatically - * stop the context when the block completes or when an exception is thrown. - */ - def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { - try { - block(ssc) - } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } - } - } - - /** - * Set up required DStreams to test the DStream operation using the two sequences - * of input collections. 
- */ - def setupStreams[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - numPartitions: Int = numInputPartitions - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream = new TestInputStream(ssc, input, numPartitions) - val operatedStream = operation(inputStream) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) - outputStream.register() - ssc - } - - /** - * Set up required DStreams to test the binary operation using the sequence - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W] - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) - val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) - val operatedStream = operation(inputStream1, inputStream2) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) - outputStream.register() - ssc - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of items for each RDD. - */ - def runStreams[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[V]] = { - // Flatten each RDD into a single Seq - runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each - * representing one partition. - */ - def runStreamsWithPartitions[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[Seq[V]]] = { - assert(numBatches > 0, "Number of batches to run stream computation is zero") - assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") - logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) - - // Get the output buffer - val outputStream = ssc.graph.getOutputStreams. - filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). 
- head.asInstanceOf[TestOutputStreamWithPartitions[V]] - val output = outputStream.output - - try { - // Start computation - ssc.start() - - // Advance manual clock - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) - if (actuallyWait) { - for (i <- 1 to numBatches) { - logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) - Thread.sleep(batchDuration.milliseconds) - } - } else { - clock.addToTime(numBatches * batchDuration.milliseconds) - } - logInfo("Manual clock after advancing = " + clock.time) - - // Wait until expected number of output items have been generated - val startTime = System.currentTimeMillis() - while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { - logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) - } - val timeTaken = System.currentTimeMillis() - startTime - logInfo("Output generated in " + timeTaken + " milliseconds") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") - assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") - - Thread.sleep(100) // Give some time for the forgetting old RDDs to complete - } finally { - ssc.stop(stopSparkContext = true) - } - output - } - - /** - * Verify whether the output values after running a DStream operation - * is same as the expected output values, by comparing the output - * collections either as lists (order matters) or sets (order does not matter) - */ - def verifyOutput[V: ClassTag]( - output: Seq[Seq[V]], - expectedOutput: Seq[Seq[V]], - useSet: Boolean - ) { - logInfo("--------------------------------") - logInfo("output.size = " + output.size) - logInfo("output") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output.size = " + expectedOutput.size) - logInfo("expected output") - expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("--------------------------------") - - // Match the output with the expected output - assert(output.size === expectedOutput.size, "Number of outputs do not match") - for (i <- 0 until output.size) { - if (useSet) { - assert(output(i).toSet === expectedOutput(i).toSet) - } else { - assert(output(i).toList === expectedOutput(i).toList) - } - } - logInfo("Output verified successfully") - } - - /** - * Test unary DStream operation with a list of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - useSet: Boolean = false - ) { - testOperation[U, V](input, operation, expectedOutput, -1, useSet) - } - - /** - * Test unary DStream operation with a list of inputs - * @param input Sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - numBatches: Int, - useSet: Boolean 
- ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V](input, operation)) { ssc => - val output = runStreams[V](ssc, numBatches_, expectedOutput.size) - verifyOutput[V](output, expectedOutput, useSet) - } - } - - /** - * Test binary DStream operation with two lists of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - useSet: Boolean - ) { - testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) - } - - /** - * Test binary DStream operation with two lists of inputs - * @param input1 First sequence of input collections - * @param input2 Second sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => - val output = runStreams[W](ssc, numBatches_, expectedOutput.size) - verifyOutput[W](output, expectedOutput, useSet) - } - } -} diff --git a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala index 93741e0375164..9ee57d7581d85 100644 --- a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala +++ b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala @@ -17,13 +17,23 @@ package org.apache.spark.streaming.twitter -import org.apache.spark.streaming.{StreamingContext, TestSuiteBase} -import org.apache.spark.storage.StorageLevel + +import org.scalatest.{BeforeAndAfter, FunSuite} +import twitter4j.Status import twitter4j.auth.{NullAuthorization, Authorization} + +import org.apache.spark.Logging +import org.apache.spark.streaming.{Seconds, StreamingContext} +import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream -import twitter4j.Status -class TwitterStreamSuite extends TestSuiteBase { +class TwitterStreamSuite extends FunSuite with BeforeAndAfter with Logging { + + val batchDuration = Seconds(1) + + private val master: String = "local[2]" + + private val framework: String = this.getClass.getSimpleName test("twitter input stream") { val ssc = new StreamingContext(master, framework, batchDuration) diff --git a/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala deleted file mode 100644 index 76b3b73a2ff3b..0000000000000 --- a/external/zeromq/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming - -import java.io.{ObjectInputStream, IOException} - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.SynchronizedBuffer -import scala.reflect.ClassTag - -import org.scalatest.{BeforeAndAfter, FunSuite} - -import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.{SparkConf, Logging} -import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils - -/** - * This is a input stream just for the testsuites. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a sequence as input, and - * returns the i_th element at the i_th batch unde manual clock. - */ -class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]], numPartitions: Int) - extends InputDStream[T](ssc_) { - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[T]] = { - logInfo("Computing RDD for time " + validTime) - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedInput = if (index < input.size) input(index) else Seq[T]() - - // lets us test cases where RDDs are not created - if (selectedInput == null) - return None - - val rdd = ssc.sc.makeRDD(selectedInput, numPartitions) - logInfo("Created RDD " + rdd.id + " with " + selectedInput) - Some(rdd) - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of items - */ -class TestOutputStream[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.collect() - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } -} - -/** - * This is a output stream just for the testsuites. All the output is collected into a - * ArrayBuffer. This buffer is wiped clean on being restored from checkpoint. - * - * The buffer contains a sequence of RDD's, each containing a sequence of partitions, each - * containing a sequence of items. 
- */ -class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], - val output: ArrayBuffer[Seq[Seq[T]]] = ArrayBuffer[Seq[Seq[T]]]()) - extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { - val collected = rdd.glom().collect().map(_.toSeq) - output += collected - }) { - - // This is to clear the output buffer every it is read from a checkpoint - @throws(classOf[IOException]) - private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { - ois.defaultReadObject() - output.clear() - } - - def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) -} - -/** - * This is the base trait for Spark Streaming testsuites. This provides basic functionality - * to run user-defined set of input on user-defined stream operations, and verify the output. - */ -trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { - - // Name of the framework for Spark context - def framework = this.getClass.getSimpleName - - // Master for Spark context - def master = "local[2]" - - // Batch duration - def batchDuration = Seconds(1) - - // Directory where the checkpoint data will be saved - lazy val checkpointDir = { - val dir = Utils.createTempDir() - logDebug(s"checkpointDir: $dir") - dir.toString - } - - // Number of partitions of the input parallel collections created for testing - def numInputPartitions = 2 - - // Maximum time to wait before the test times out - def maxWaitTimeMillis = 10000 - - // Whether to use manual clock or not - def useManualClock = true - - // Whether to actually wait in real time before changing manual clock - def actuallyWait = false - - //// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things. - val conf = new SparkConf() - .setMaster(master) - .setAppName(framework) - - // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { - if (useManualClock) { - logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") - } else { - logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - } - } - - // Default after function for any streaming test suite. Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { - System.clearProperty("spark.streaming.clock") - } - - before(beforeFunction) - after(afterFunction) - - /** - * Run a block of code with the given StreamingContext and automatically - * stop the context when the block completes or when an exception is thrown. - */ - def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = { - try { - block(ssc) - } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } - } - } - - /** - * Set up required DStreams to test the DStream operation using the two sequences - * of input collections. 
- */ - def setupStreams[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - numPartitions: Int = numInputPartitions - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream = new TestInputStream(ssc, input, numPartitions) - val operatedStream = operation(inputStream) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]]) - outputStream.register() - ssc - } - - /** - * Set up required DStreams to test the binary operation using the sequence - * of input collections. - */ - def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W] - ): StreamingContext = { - // Create StreamingContext - val ssc = new StreamingContext(conf, batchDuration) - if (checkpointDir != null) { - ssc.checkpoint(checkpointDir) - } - - // Setup the stream computation - val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions) - val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions) - val operatedStream = operation(inputStream1, inputStream2) - val outputStream = new TestOutputStreamWithPartitions(operatedStream, - new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]]) - outputStream.register() - ssc - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of items for each RDD. - */ - def runStreams[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[V]] = { - // Flatten each RDD into a single Seq - runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq) - } - - /** - * Runs the streams set up in `ssc` on manual clock for `numBatches` batches and - * returns the collected output. It will wait until `numExpectedOutput` number of - * output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached. - * - * Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each - * representing one partition. - */ - def runStreamsWithPartitions[V: ClassTag]( - ssc: StreamingContext, - numBatches: Int, - numExpectedOutput: Int - ): Seq[Seq[Seq[V]]] = { - assert(numBatches > 0, "Number of batches to run stream computation is zero") - assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero") - logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput) - - // Get the output buffer - val outputStream = ssc.graph.getOutputStreams. - filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]). 
- head.asInstanceOf[TestOutputStreamWithPartitions[V]] - val output = outputStream.output - - try { - // Start computation - ssc.start() - - // Advance manual clock - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) - if (actuallyWait) { - for (i <- 1 to numBatches) { - logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) - Thread.sleep(batchDuration.milliseconds) - } - } else { - clock.addToTime(numBatches * batchDuration.milliseconds) - } - logInfo("Manual clock after advancing = " + clock.time) - - // Wait until expected number of output items have been generated - val startTime = System.currentTimeMillis() - while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { - logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) - } - val timeTaken = System.currentTimeMillis() - startTime - logInfo("Output generated in " + timeTaken + " milliseconds") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms") - assert(output.size === numExpectedOutput, "Unexpected number of outputs generated") - - Thread.sleep(100) // Give some time for the forgetting old RDDs to complete - } finally { - ssc.stop(stopSparkContext = true) - } - output - } - - /** - * Verify whether the output values after running a DStream operation - * is same as the expected output values, by comparing the output - * collections either as lists (order matters) or sets (order does not matter) - */ - def verifyOutput[V: ClassTag]( - output: Seq[Seq[V]], - expectedOutput: Seq[Seq[V]], - useSet: Boolean - ) { - logInfo("--------------------------------") - logInfo("output.size = " + output.size) - logInfo("output") - output.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output.size = " + expectedOutput.size) - logInfo("expected output") - expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("--------------------------------") - - // Match the output with the expected output - assert(output.size === expectedOutput.size, "Number of outputs do not match") - for (i <- 0 until output.size) { - if (useSet) { - assert(output(i).toSet === expectedOutput(i).toSet) - } else { - assert(output(i).toList === expectedOutput(i).toList) - } - } - logInfo("Output verified successfully") - } - - /** - * Test unary DStream operation with a list of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - useSet: Boolean = false - ) { - testOperation[U, V](input, operation, expectedOutput, -1, useSet) - } - - /** - * Test unary DStream operation with a list of inputs - * @param input Sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag]( - input: Seq[Seq[U]], - operation: DStream[U] => DStream[V], - expectedOutput: Seq[Seq[V]], - numBatches: Int, - useSet: Boolean 
- ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V](input, operation)) { ssc => - val output = runStreams[V](ssc, numBatches_, expectedOutput.size) - verifyOutput[V](output, expectedOutput, useSet) - } - } - - /** - * Test binary DStream operation with two lists of inputs, with number of - * batches to run same as the number of expected output values - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - useSet: Boolean - ) { - testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) - } - - /** - * Test binary DStream operation with two lists of inputs - * @param input1 First sequence of input collections - * @param input2 Second sequence of input collections - * @param operation Binary DStream operation to be applied to the 2 inputs - * @param expectedOutput Sequence of expected output collections - * @param numBatches Number of batches to run the operation for - * @param useSet Compare the output values with the expected output values - * as sets (order matters) or as lists (order does not matter) - */ - def testOperation[U: ClassTag, V: ClassTag, W: ClassTag]( - input1: Seq[Seq[U]], - input2: Seq[Seq[V]], - operation: (DStream[U], DStream[V]) => DStream[W], - expectedOutput: Seq[Seq[W]], - numBatches: Int, - useSet: Boolean - ) { - val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size - withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => - val output = runStreams[W](ssc, numBatches_, expectedOutput.size) - verifyOutput[W](output, expectedOutput, useSet) - } - } -} diff --git a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala index cc10ff6ae03cd..a7566e733d891 100644 --- a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala +++ b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala @@ -20,12 +20,19 @@ package org.apache.spark.streaming.zeromq import akka.actor.SupervisorStrategy import akka.util.ByteString import akka.zeromq.Subscribe +import org.scalatest.FunSuite import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{StreamingContext, TestSuiteBase} +import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream -class ZeroMQStreamSuite extends TestSuiteBase { +class ZeroMQStreamSuite extends FunSuite { + + val batchDuration = Seconds(1) + + private val master: String = "local[2]" + + private val framework: String = this.getClass.getSimpleName test("zeromq input stream") { val ssc = new StreamingContext(master, framework, batchDuration) From 994d1d327455111025b4ea1ae7aad440cffb40d5 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Wed, 19 Nov 2014 14:36:04 +0530 Subject: [PATCH 3/3] Fixed failing flume tests --- .../spark/streaming/flume/FlumePollingStreamSuite.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index ba079d84e4529..b57a1c71e35b9 100644 --- 
a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -52,6 +52,13 @@ class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging .setMaster("local[2]") .setAppName(this.getClass.getSimpleName) + def beforeFunction() { + logInfo("Using manual clock") + conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + } + + before(beforeFunction()) + test("flume polling test") { testMultipleTimes(testFlumePolling) } @@ -234,4 +241,5 @@ class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging null } } + }
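In case it is useful to see the resulting test pattern in one place: after the shared TestSuiteBase copies are deleted, each external module's suite is a plain FunSuite that builds its own StreamingContext, as the rewritten MQTTStreamSuite, TwitterStreamSuite and ZeroMQStreamSuite hunks above show. The sketch below restates that pattern only; the suite name, test name, and body are illustrative placeholders, not part of this patch.

```scala
import org.scalatest.FunSuite

import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical suite following the same shape as the rewritten external-module
// suites: no TestSuiteBase, the suite owns its master/framework/batchDuration
// and creates a local StreamingContext per test.
class ExampleStreamSuite extends FunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("example input stream") {
    // Build a local StreamingContext exactly as the real suites do.
    val ssc = new StreamingContext(master, framework, batchDuration)
    // A real suite would create the module's ReceiverInputDStream here
    // and assert on the received data before stopping the context.
    ssc.stop()
  }
}
```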