From b3b0ff118cac3c0a5a10f9912b383bb0665c9a1b Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Wed, 16 Jul 2014 00:03:04 -0700 Subject: [PATCH 01/12] [SPARK-1981] Add AWS Kinesis streaming support --- assembly/pom.xml | 10 + bin/run-kinesis-example | 60 +++ bin/run-kinesis-example.cmd | 90 +++++ .../src/main/scala/SparkApp.scala | 7 + dev/audit-release/sbt_app_kinesis/build.sbt | 30 ++ .../src/main/scala/SparkApp.scala | 33 ++ dev/create-release/create-release.sh | 2 + docs/streaming-custom-receivers.md | 4 +- docs/streaming-programming-guide.md | 65 +++- extras/spark-kinesis-asl/pom.xml | 90 +++++ .../streaming/JavaKinesisWordCount.java | 310 ++++++++++++++++ .../src/main/resources/log4j.properties | 42 +++ .../examples/streaming/KinesisWordCount.scala | 345 ++++++++++++++++++ .../streaming/kinesis/CheckpointState.scala | 52 +++ .../streaming/kinesis/KinesisReceiver.scala | 122 +++++++ .../kinesis/KinesisRecordProcessor.scala | 148 ++++++++ .../kinesis/KinesisRecordSerializer.scala | 54 +++ .../KinesisStringRecordSerializer.scala | 47 +++ .../streaming/kinesis/KinesisUtils.scala | 151 ++++++++ .../src/test/resources/log4j.properties | 42 +++ .../kinesis/KinesisReceiverSuite.scala | 267 ++++++++++++++ pom.xml | 8 + project/SparkBuild.scala | 21 +- 23 files changed, 1992 insertions(+), 8 deletions(-) create mode 100755 bin/run-kinesis-example create mode 100755 bin/run-kinesis-example.cmd create mode 100644 dev/audit-release/sbt_app_kinesis/build.sbt create mode 100644 dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala create mode 100644 extras/spark-kinesis-asl/pom.xml create mode 100644 extras/spark-kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java create mode 100644 extras/spark-kinesis-asl/src/main/resources/log4j.properties create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala create mode 100644 extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala create mode 100644 extras/spark-kinesis-asl/src/test/resources/log4j.properties create mode 100644 extras/spark-kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala diff --git a/assembly/pom.xml b/assembly/pom.xml index 0c60b66c3daca..60cc5aef67098 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -173,6 +173,16 @@ + + spark-kinesis-asl + + + org.apache.spark + spark-kinesis-asl_${scala.binary.version} + ${project.version} + + + bigtop-dist + + 4.0.0 + + org.apache.spark + spark-parent + 1.1.0-SNAPSHOT + ../../pom.xml + + + + org.apache.spark + spark-kinesis-asl_2.10 + jar + Spark Kinesis Integration + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + + + com.amazonaws + amazon-kinesis-client + 1.1.0 + + + com.amazonaws + 
aws-java-sdk + 1.8.3 + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.mockito + mockito-all + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.easymock + easymockclassextension + test + + + com.novocode + junit-interface + test + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.scalatest + scalatest-maven-plugin + + + + diff --git a/extras/spark-kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java b/extras/spark-kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java new file mode 100644 index 0000000000000..6f3a2454907ec --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java @@ -0,0 +1,310 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.streaming; + +import java.util.List; +import java.util.regex.Pattern; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.Milliseconds; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.apache.spark.streaming.dstream.DStream; +import org.apache.spark.streaming.kinesis.KinesisRecordSerializer; +import org.apache.spark.streaming.kinesis.KinesisStringRecordSerializer; +import org.apache.spark.streaming.kinesis.KinesisUtils; + +import scala.Tuple2; + +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.services.kinesis.AmazonKinesisClient; +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; +import com.google.common.base.Optional; +import com.google.common.collect.Lists; + +/** + * Java-friendly Kinesis Spark Streaming WordCount example + * + * See http://spark.apache.org/docs/latest/streaming-programming-guide.html for more details on the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receivers) per shard of the given stream. + * It then starts pulling from the tip of the given and at the given . 
+ * Because we're pulling from the tip (InitialPositionInStream.LATEST), only new stream data will be picked up after the KinesisReceiver starts. + * This could lead to missed records if data is added to the stream while no KinesisReceivers are running. + * In production, you'll want to switch to InitialPositionInStream.TRIM_HORIZON which will read up to 24 hours (Kinesis limit) of previous stream data + * depending on the checkpoint frequency. + * InitialPositionInStream.TRIM_HORIZON may lead to duplicate processing of records depending on the checkpoint frequency. + * Record processing should be idempotent when possible. + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: JavaKinesisWordCount + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * is the batch interval in milliseconds (ie. 1000ms) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ bin/run-kinesis-example \ + * org.apache.spark.examples.streaming.JavaKinesisWordCount mySparkStream https://kinesis.us-east-1.amazonaws.com 1000 + * + * There is a companion helper class called KinesisWordCountProducer which puts dummy data onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducer are provided in the class definition. + */ +public final class JavaKinesisWordCount { + private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); + private static final Logger logger = Logger.getLogger(JavaKinesisWordCount.class); + + /** + * Make the constructor private to enforce singleton + */ + private JavaKinesisWordCount() { + } + + public static void main(String[] args) { + /** + * Check that all required args were passed in. + */ + if (args.length < 3) { + System.err.println("Usage: JavaKinesisWordCount "); + System.exit(1); + } + + /** + * (This was lifted from the StreamingExamples.scala in order to avoid the dependency on the spark-examples artifact.) + * Set reasonable logging levels for streaming if the user has not configured log4j. + */ + boolean log4jInitialized = Logger.getRootLogger().getAllAppenders() + .hasMoreElements(); + if (!log4jInitialized) { + /** We first log something to initialize Spark's default logging, then we override the logging level. */ + Logger.getRootLogger() + .info("Setting log level to [ERROR] for streaming example." 
+ + " To override add a custom log4j.properties to the classpath."); + Logger.getRootLogger().setLevel(Level.ERROR); + Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); + } + + /** Populate the appropriate variables from the given args */ + String stream = args[0]; + String endpoint = args[1]; + Integer batchIntervalMillis = Integer.valueOf(args[2]); + + /** Create a Kinesis client in order to determine the number of shards for the given stream */ + AmazonKinesisClient KinesisClient = new AmazonKinesisClient( + new DefaultAWSCredentialsProviderChain()); + + /** Determine the number of shards from the stream */ + int numShards = KinesisClient.describeStream(stream) + .getStreamDescription().getShards().size(); + + /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStreams for each stream shard */ + int numStreams = numShards; + + /** Must add 1 more thread than the number of receivers or the output won't show properly from the driver */ + int numSparkThreads = numStreams + 1; + + /** Set the app name */ + String app = "KinesisWordCount"; + + /** Setup the Spark config. */ + SparkConf sparkConfig = new SparkConf().setAppName(app).setMaster( + "local[" + numSparkThreads + "]"); + + /** + * Set the batch interval. + * Records will be pulled from the Kinesis stream and stored as a single DStream within Spark every batch interval. + */ + Duration batchInterval = Milliseconds.apply(batchIntervalMillis); + + /** + * It's recommended that you perform a Spark checkpoint between 5 and 10 times the batch interval. + * While this is the Spark checkpoint interval, we're going to use it for the Kinesis checkpoint interval, as well. + */ + Duration checkpointInterval = batchInterval.$times(5); + + /** Setup the StreamingContext */ + JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval); + + /** Setup the checkpoint directory used by Spark Streaming */ + jssc.checkpoint("/tmp/checkpoint"); + + /** Create the same number of Kinesis Receivers/DStreams as stream shards, then union them all */ + JavaDStream allStreams = KinesisUtils + .createJavaStream(jssc, app, stream, endpoint, checkpointInterval.milliseconds(), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); + /** Set the checkpoint interval */ + allStreams.checkpoint(checkpointInterval); + for (int i = 1; i < numStreams; i++) { + /** Create a new Receiver/DStream for each stream shard */ + JavaDStream dStream = KinesisUtils + .createJavaStream(jssc, app, stream, endpoint, checkpointInterval.milliseconds(), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); + /** Set the Spark checkpoint interval */ + dStream.checkpoint(checkpointInterval); + + /** Union with the existing streams */ + allStreams = allStreams.union(dStream); + } + + /** This implementation uses the String-based KinesisRecordSerializer impl */ + final KinesisRecordSerializer recordSerializer = new KinesisStringRecordSerializer(); + + /** + * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection. + * Convert lines of byte[] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR + * We're caching the result here so that we can use it later without having to re-materialize the underlying RDDs. 
+ */ + JavaDStream words = allStreams + .flatMap(new FlatMapFunction() { + /** + * Convert lines of byte[] to multiple words split by WORD_SEPARATOR + * @param byte array + * @return iterable of words split by WORD_SEPARATOR + */ + @Override + public Iterable call(byte[] line) { + return Lists.newArrayList(WORD_SEPARATOR.split(recordSerializer.deserialize(line))); + } + }).cache(); + + /** windowInterval must be a multiple of the batchInterval */ + Duration windowInterval = batchInterval.$times(5); + + /** slideInterval must be a multiple of the batchInterval */ + Duration slideInterval = batchInterval.$times(1); + + /** + * Map each word to a (word, 1) tuple so we can reduce/aggregate later. + * We're caching the result here so that we can use it later without having + * to re-materialize the underlying RDDs. + */ + JavaPairDStream wordCounts = words.mapToPair( + new PairFunction() { + /** + * Create the (word, 1) tuple + * @param word + * @return (word, 1) tuple + */ + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }); + + /** + * Reduce/aggregate by key + * We're caching the result here so that we can use it later without having + * to re-materialize the underlying RDDs. + */ + JavaPairDStream wordCountsByKey = wordCounts.reduceByKey( + new Function2() { + @Override + public Integer call(Integer i1, Integer i2) { + return i1 + i2; + } + }).cache(); + + /** + * Reduce/aggregate by key for the given window. + * We're using the inverse-function (left - right) optimization over the sliding window per the Window Operations described at the following url: + * http://spark.apache.org/docs/latest/streaming-programming-guide.html#transformations + */ + JavaPairDStream wordCountsByKeyAndWindow = wordCountsByKey.reduceByKeyAndWindow( + new Function2() { + @Override + public Integer call(Integer i1, Integer i2) { + return i1 + i2; + } + }, windowInterval, slideInterval); + + /** + * Sort and print the word counts by key and window. + * This is an Output Operation and will materialize the DStream. + */ + sortAndPrint("Word Counts By Key and Window", wordCountsByKeyAndWindow); + + Function2, Optional, Optional> updateTotals = + new Function2, Optional, Optional>() { + @Override public Optional call(List newCounts, Optional currentCount) { + Integer currentSum = 0; + if (currentCount.isPresent()) { + currentSum = currentCount.get(); + } + Integer newSum = currentSum; + + for (Integer newCount : newCounts) { + newSum += newCount; + } + return Optional.of(newSum); + } + }; + + /** + * Calculate the running totals using the updateTotals method. + */ + JavaPairDStream wordTotalsByKey = wordCountsByKey.updateStateByKey(updateTotals); + + /** + * Sort and print the running word totals. + * This is an Output Operation and will materialize the DStream. + */ + sortAndPrint("Word Count Totals By Key", wordTotalsByKey); + + /** Start the streaming context and await termination */ + jssc.start(); + jssc.awaitTermination(); + } + + /** + * Sort and print the given dstream. + * This is an Output Operation that will materialize the underlying DStream. + * Everything up to this point is a lazy Transformation Operation. 
+ * + * @param description of the dstream for logging purposes + * @param dstream to sort and print + */ + private static void sortAndPrint(final String description, JavaPairDStream dstream) { + dstream.foreachRDD( + new Function, Void>() { + public Void call(JavaPairRDD batch) { + JavaPairRDD sortedBatch = batch.sortByKey(true); + logger.info(description); + for (Object wordCount: sortedBatch.collect()) { + logger.info(wordCount); + } + + return null; + } + }); + } +} diff --git a/extras/spark-kinesis-asl/src/main/resources/log4j.properties b/extras/spark-kinesis-asl/src/main/resources/log4j.properties new file mode 100644 index 0000000000000..ad789341e62c9 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/resources/log4j.properties @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file streaming/target/unit-tests.log +log4j.rootCategory=WARN, console + +# File appender +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=false +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n + +# Console appender +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO + +# Log all Kinesis Streaming messages +log4j.logger.org.apache.spark.examples.streaming=DEBUG +log4j.logger.org.apache.spark.streaming.kinesis=DEBUG diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala new file mode 100644 index 0000000000000..0a0cccb49433d --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.streaming + +import java.nio.ByteBuffer +import org.apache.log4j.Level +import org.apache.log4j.Logger +import org.apache.spark.Logging +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext._ +import org.apache.spark.streaming.Milliseconds +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions +import org.apache.spark.streaming.kinesis.KinesisStringRecordSerializer +import org.apache.spark.streaming.kinesis.KinesisUtils +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.model.PutRecordRequest +import scala.util.Random +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.dstream.DStream + +/** + * Kinesis Spark Streaming WordCount example. + * + * See http://spark.apache.org/docs/latest/streaming-programming-guide.html for more details on the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receivers) per shard of the given stream. + * It then starts pulling from the tip of the given and at the given . + * Because we're pulling from the tip (InitialPositionInStream.LATEST), only new stream data will be picked up after the KinesisReceiver starts. + * This could lead to missed records if data is added to the stream while no KinesisReceivers are running. + * In production, you'll want to switch to InitialPositionInStream.TRIM_HORIZON which will read up to 24 hours (Kinesis limit) of previous stream data + * depending on the checkpoint frequency. + * + * InitialPositionInStream.TRIM_HORIZON may lead to duplicate processing of records depending on the checkpoint frequency. + * Record processing should be idempotent when possible. + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: KinesisWordCount + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * is the batch interval in millis (ie. 1000ms) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ bin/run-kinesis-example \ + * org.apache.spark.examples.streaming.KinesisWordCount mySparkStream https://kinesis.us-east-1.amazonaws.com 100 + * + * There is a companion helper class below called KinesisWordCountProducer which puts dummy data onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducer are provided in that class definition. 
+ */ +object KinesisWordCount extends Logging { + val WordSeparator = " " + + def main(args: Array[String]) { +/** + * Check that all required args were passed in. + */ + if (args.length < 3) { + System.err.println("Usage: KinesisWordCount ") + System.exit(1) + } + + /** + * (This was lifted from the StreamingExamples.scala in order to avoid the dependency on the spark-examples artifact.) + * Set reasonable logging levels for streaming if the user has not configured log4j. + */ + val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements + if (!log4jInitialized) { + /** We first log something to initialize Spark's default logging, then we override the logging level. */ + logInfo("Setting log level to [INFO] for streaming example." + + " To override add a custom log4j.properties to the classpath.") + + Logger.getRootLogger().setLevel(Level.INFO) + Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); + } + + /** Populate the appropriate variables from the given args */ + val Array(stream, endpoint, batchIntervalMillisStr) = args + val batchIntervalMillis = batchIntervalMillisStr.toInt + + /** Create a Kinesis client in order to determine the number of shards for the given stream */ + val KinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()); + + /** Determine the number of shards from the stream */ + val numShards = KinesisClient.describeStream(stream).getStreamDescription().getShards().size() + + /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStreams for each stream shard */ + val numStreams = numShards + + /** Must add 1 more thread than the number of receivers or the output won't show properly from the driver */ + val numSparkThreads = numStreams + 1 + + /** Set the app name */ + val app = "KinesisWordCount" + + /** Setup the Spark config. */ + val sparkConfig = new SparkConf().setAppName(app).setMaster(s"local[$numSparkThreads]") + + /** + * Set the batch interval. + * Records will be pulled from the Kinesis stream and stored as a single DStream within Spark every batch interval. + */ + val batchInterval = Milliseconds(batchIntervalMillis) + + /** + * It's recommended that you perform a Spark checkpoint between 5 and 10 times the batch interval. + * While this is the Spark checkpoint interval, we're going to use it for the Kinesis checkpoint interval, as well. 
+ */ + val checkpointInterval = batchInterval * 5 + + /** Setup the StreamingContext */ + val ssc = new StreamingContext(sparkConfig, batchInterval) + + /** Setup the checkpoint directory used by Spark Streaming */ + ssc.checkpoint("/tmp/checkpoint"); + + /** Create the same number of Kinesis Receivers/DStreams as stream shards, then union them all */ + var allStreams: DStream[Array[Byte]] = KinesisUtils.createStream(ssc, app, stream, endpoint, checkpointInterval.milliseconds, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) + /** Set the checkpoint interval */ + allStreams.checkpoint(checkpointInterval) + for (i <- 1 until numStreams) { + /** Create a new Receiver/DStream for each stream shard */ + val dStream = KinesisUtils.createStream(ssc, app, stream, endpoint, checkpointInterval.milliseconds, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) + /** Set the Spark checkpoint interval */ + dStream.checkpoint(checkpointInterval) + + /** Union with the existing streams */ + allStreams = allStreams.union(dStream) + } + + /** This implementation uses the String-based KinesisRecordSerializer impl */ + val recordSerializer = new KinesisStringRecordSerializer() + + /** + * Sort and print the given dstream. + * This is an Output Operation that will materialize the underlying DStream. + * Everything up to this point is a lazy Transformation Operation. + * + * @param description of the dstream for logging purposes + * @param dstream to sort and print + */ + def sortAndPrint(description: String, dstream: DStream[(String,Int)]) = { + dstream.foreachRDD((batch, endOfWindowTime) => { + val sortedBatch = batch.sortByKey(true) + logInfo(s"$description @ $endOfWindowTime") + sortedBatch.collect().foreach( + wordCount => logInfo(s"$wordCount")) + }) + } + + /** + * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection. + * Convert lines of Array[Byte] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR + * We're caching the result here so that we can use it later without having to re-materialize the underlying RDDs. + */ + val words = allStreams.flatMap(line => recordSerializer.deserialize(line).split(WordSeparator)).cache() + + /** windowInterval must be a multiple of the batchInterval */ + val windowInterval = batchInterval * 5 + + /** slideInterval must be a multiple of the batchInterval */ + val slideInterval = batchInterval * 1 + + /** + * Map each word to a (word, 1) tuple so we can reduce/aggregate later. + * We're caching the result here so that we can use it later without having + * to re-materialize the underlying RDDs. + */ + val wordCounts = words.map(word => (word, 1)) + + /** + * Reduce/aggregate by key. + * We're caching the result here so that we can use it later without having + * to re-materialize the underlying RDDs. + */ + val wordCountsByKey = wordCounts.reduceByKey((left, right) => left + right) + + /** + * Reduce/aggregate by key for the given window. + * We're using the inverse-function (left - right) optimization over the sliding window per the Window Operations described at the following url: + * http://spark.apache.org/docs/latest/streaming-programming-guide.html#transformations + */ + val wordCountsByKeyAndWindow = wordCountsByKey.reduceByKeyAndWindow((left, right) => left + right, (left, right) => left - right, windowInterval, slideInterval) + + /** + * Sort and print the word counts by key and window. + * This is an Output Operation and will materialize the DStream. 
+ * + */ + sortAndPrint("Word Counts By Key and Window", wordCountsByKeyAndWindow) + + /** + * Update the running totals of words. + * + * @param sequence of new counts + * @param current running total (could be None if no current count exists) + */ + def updateTotals = (newCounts: Seq[Int], currentCounts: Option[Int]) => { + val newCount = newCounts.foldLeft(0)((left, right) => left + right) + val currentCount = currentCounts.getOrElse(0) + Some(newCount + currentCount) + } + + /** + * Calculate the running totals using the updateTotals method. + */ + val wordTotalsByKey = wordCountsByKey.updateStateByKey[Int](updateTotals) + + /** + * Sort and print the running word totals. + * This is an Output Operation and will materialize the DStream. + */ + sortAndPrint("Word Count Totals By Key", wordTotalsByKey) + + /** Start the streaming context and await termination */ + ssc.start() + ssc.awaitTermination() + } +} + +/** + * Usage: KinesisWordCountProducer + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * is the rate of records per second to put onto the stream + * is the rate of records per second to put onto the stream + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ bin/run-kinesis-example \ + * org.apache.spark.examples.streaming.KinesisWordCountProducer mySparkStream https://kinesis.us-east-1.amazonaws.com 10 5 + */ +private[streaming] +object KinesisWordCountProducer extends Logging { + val MaxRandomInts = 10 + + def main(args: Array[String]) { + if (args.length < 4) { + System.err.println("Usage: KinesisWordCountProducer ") + System.exit(1) + } + + /** + * (This was lifted from the StreamingExamples.scala in order to avoid the dependency on the spark-examples artifact.) + * Set reasonable logging levels for streaming if the user has not configured log4j. + */ + val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements + if (!log4jInitialized) { + /** We first log something to initialize Spark's default logging, then we override the logging level. */ + logInfo("Setting log level to [INFO] for streaming example." 
+ + " To override add a custom log4j.properties to the classpath.") + + Logger.getRootLogger().setLevel(Level.INFO) + Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); + } + + /** Populate the appropriate variables from the given args */ + val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args + + /** Generate the records and return the totals */ + val totals: Seq[(Int, Int)] = generate(stream, endpoint, recordsPerSecond.toInt, wordsPerRecord.toInt) + + logInfo("Totals") + /** Print the array of (index, total) tuples */ + totals.foreach(total => logInfo(total.toString())) + } + + def generate(stream: String, endpoint: String, recordsPerSecond: Int, wordsPerRecord: Int): Seq[(Int, Int)] = { + val WORD_SEPARATOR = " " + + /** Create the Kinesis client */ + val KinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) + + logInfo(s"Putting records onto stream $stream and endpoint $endpoint at a rate of $recordsPerSecond records per second and $wordsPerRecord words per record"); + + /** Create the String-based record serializer */ + val recordSerializer = new KinesisStringRecordSerializer() + + val totals = new Array[Int](MaxRandomInts) + /** Put String records onto the stream per the given recordPerSec and wordsPerRecord */ + for (i <- 1 to 5) { + /** Generate recordsPerSec records to put onto the stream */ + val records = (1 to recordsPerSecond.toInt).map { recordNum => + /** Randomly generate each wordsPerRec words between 0 (inclusive) and MAX_RANDOM_INTS (exclusive) */ + val data = (1 to wordsPerRecord.toInt).map(x => { + /** Generate the random int */ + val randomInt = Random.nextInt(MaxRandomInts) + + /** Keep track of the totals */ + totals(randomInt) += 1 + + /** Convert the Int to a String */ + randomInt.toString() + }) + /** Create a String of randomInts separated by WORD_SEPARATOR */ + .mkString(WORD_SEPARATOR) + + /** Create a partitionKey based on recordNum */ + val partitionKey = s"partitionKey-$recordNum" + + /** Create a PutRecordRequest with an Array[Byte] version of the data */ + val putRecordRequest = new PutRecordRequest().withStreamName(stream).withPartitionKey(partitionKey) + .withData(ByteBuffer.wrap(recordSerializer.serialize(data))); + + /** Put the record onto the stream and capture the PutRecordResult */ + val putRecordResult = KinesisClient.putRecord(putRecordRequest); + + logInfo(s"Successfully put record with partitionKey $partitionKey and shardId ${putRecordResult.getShardId()} and data $data") + } + + /** Sleep for a second */ + Thread.sleep(1000) + } + + /** Convert the totals to (index, total) tuple */ + (0 to (MaxRandomInts - 1)).zip(totals) + } +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala new file mode 100644 index 0000000000000..a28d022cb61c8 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import org.apache.spark.Logging +import org.apache.spark.streaming.util.Clock +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.streaming.util.SystemClock + +/** + * This is a helper class for managing checkpoint clocks. + * + * @param checkpoint interval in millis + * @param current clock. if none specified, will default to current SystemClock + */ +class CheckpointState(checkpointIntervalMillis: Long, currentClock: Clock = new SystemClock()) extends Logging { + /** + * Initialize the checkpoint clock using the given currentClock + checkpointIntervalMillis + */ + val checkpointClock = new ManualClock() + checkpointClock.setTime(currentClock.currentTime() + checkpointIntervalMillis) + + /** + * Check if it's time to checkpoint based on the current time and the derived time for the next checkpoint + * + * @return true if it's time to checkpoint + */ + def shouldCheckpoint(): Boolean = { + new SystemClock().currentTime() > checkpointClock.currentTime() + } + + /** + * Advance the checkpoint clock by the checkpoint interval. + */ + def advanceCheckpoint() = { + checkpointClock.addToTime(checkpointIntervalMillis) + } +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala new file mode 100644 index 0000000000000..98eed6eb196d9 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.streaming.kinesis + +import java.net.InetAddress +import java.util.UUID +import org.apache.spark.Logging +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.receiver.Receiver +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.KinesisClientLibConfiguration +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker +import java.nio.ByteBuffer +import org.apache.spark.streaming.util.SystemClock + +/** + * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver. + * This implementation relies on the Kinesis Client Library (KCL) Worker as described here: + * https://github.com/awslabs/amazon-kinesis-client + * This is a custom receiver used with StreamingContext.receiverStream(Receiver) as described here: + * http://spark.apache.org/docs/latest/streaming-custom-receivers.html + * Instances of this class will get shipped to the Spark Streaming Workers to run within a Spark Executor. + * + * @param app name + * @param Kinesis stream name + * @param endpoint url of Kinesis service + * @param checkpoint interval (millis) for Kinesis checkpointing (not Spark checkpointing). + * See the Kinesis Spark Streaming documentation for more details on the different types of checkpoints. + * @param in the absence of Kinesis checkpoint info, this is the worker's initial starting position in the stream. + * The values are either the beginning of the stream per Kinesis' limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) + * or the tip of the stream using InitialPositionInStream.LATEST. + * @param persistence strategy for RDDs and DStreams. + */ +private[streaming] class KinesisReceiver( + app: String, + stream: String, + endpoint: String, + checkpointIntervalMillis: Long, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel) + extends Receiver[Array[Byte]](storageLevel) with Logging { receiver => + + /** + * The lazy val's below will get instantiated in the remote Executor after the closure is shipped to the Spark Worker. + * These are all lazy because they're from third-party Amazon libraries and are not Serializable. + * If they're not marked lazy, they will cause NotSerializableExceptions when they're shipped to the Spark Worker. + */ + + /** + * workerId is lazy because we want the address of the actual Worker where the code runs - not the Driver's ip address. + * This makes a difference when running in a cluster. 
+ */ + lazy val workerId = InetAddress.getLocalHost.getHostAddress() + ":" + UUID.randomUUID() + + /** + * This impl uses the DefaultAWSCredentialsProviderChain per the following url: + * http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html + * and searches for credentials in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file at the default location (~/.aws/credentials) shared by all AWS SDKs and the AWS CLI + * Instance profile credentials delivered through the Amazon EC2 metadata service + */ + lazy val credentialsProvider = new DefaultAWSCredentialsProviderChain() + + /** Create a KCL config instance. */ + lazy val KinesisClientLibConfiguration = new KinesisClientLibConfiguration(app, stream, credentialsProvider, workerId) + .withKinesisEndpoint(endpoint).withInitialPositionInStream(initialPositionInStream).withTaskBackoffTimeMillis(500) + + /** + * RecordProcessorFactory creates impls of IRecordProcessor. + * IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the IRecordProcessor.processRecords() method. + * We're using our custom KinesisRecordProcessor in this case. + */ + lazy val recordProcessorFactory: IRecordProcessorFactory = new IRecordProcessorFactory { + override def createProcessor: IRecordProcessor = new KinesisRecordProcessor(receiver, workerId, KinesisUtils.createCheckpointState(checkpointIntervalMillis)) + } + + /** + * Create a Kinesis Worker. + * This is the core client abstraction from the Kinesis Client Library (KCL). + * We pass the RecordProcessorFactory from above as well as the KCL config instance. + * A Kinesis Worker can process 1..* shards from the given stream - each with its own RecordProcessor. + */ + lazy val worker: Worker = new Worker(recordProcessorFactory, KinesisClientLibConfiguration); + + /** + * This is called when the KinesisReceiver starts and must be non-blocking. + * The KCL creates and manages the receiving/processing thread pool through the Worker.run() method. + */ + override def onStart() { + logInfo(s"Starting receiver with workerId $workerId") + worker.run() + } + + /** + * This is called when the KinesisReceiver stops. + * The KCL worker.shutdown() method stops the receiving/processing threads. + * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown. + */ + override def onStop() { + logInfo(s"Shutting down receiver with workerId $workerId") + worker.shutdown() + } +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala new file mode 100644 index 0000000000000..8dd24501fe381 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import java.util.List +import scala.collection.JavaConversions.asScalaBuffer +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.Logging +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.streaming.util.SystemClock +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason +import com.amazonaws.services.kinesis.model.Record +import scala.compat.Platform +import org.apache.spark.streaming.util.Clock + +/** + * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. + * This implementation operates on the Array[Byte] from the KinesisReceiver. + * The Kinesis Worker creates an instance of this KinesisRecordProcessor upon startup. + * + * @param Kinesis receiver + * @param workerId for logging purposes + * @param checkpoint utils + * @param Kinesis checkpoint interval (millis) + */ +private[streaming] class KinesisRecordProcessor( + receiver: KinesisReceiver, + workerId: String, + checkpointState: CheckpointState) extends IRecordProcessor with Logging { + + /** shardId to be populated during initialize() */ + var shardId: String = _ + + /** + * The Kinesis Client Library calls this method during IRecordProcessor initialization. + * + * @param shardId assigned by the KCL to this particular RecordProcessor. + */ + override def initialize(shardId: String) { + logInfo(s"Initialize: Initializing workerId $workerId with shardId $shardId") + + this.shardId = shardId + } + + /** + * This method is called by the KCL when a batch of records is pulled from the Kinesis stream. + * This is the record-processing bridge between the KCL's IRecordProcessor.processRecords() + * and Spark Streaming's Receiver.store(). + * + * @param list of records from the Kinesis stream shard + * @param checkpointer used to update Kinesis when this batch has been processed/stored in the DStream + */ + override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) { + if (!receiver.isStopped()) { + try { + /** + * Convert the list of records to a list of Array[Byte] + * Note: If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming + * Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the + * internally-configured Spark serializer (kryo, etc). + * This is not desirable, so we instead store a raw Array[Byte] and decouple + * ourselves from the internal serialization strategy. + */ + val batchByteArrays = new ArrayBuffer[Array[Byte]](batch.size()) + batchByteArrays ++= batch.map(record => record.getData().array()) + + /** Store the list of Array[Byte] in Spark */ + KinesisUtils.retry(receiver.store(batchByteArrays), 4, 500) + logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") + + /** + * Checkpoint the sequence number of the last record successfully processed/stored in the batch. 
+ * In this implementation, we're checkpointing after the given checkpointIntervalMillis. + */ + if (checkpointState.shouldCheckpoint()) { + /** Perform the checkpoint */ + KinesisUtils.retry(checkpointer.checkpoint(), 4, 500) + + /** Update the next checkpoint time */ + checkpointState.advanceCheckpoint() + + logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint of ${batch.size} records for shardId $shardId") + logDebug(s"Checkpoint: Next checkpoint is at ${checkpointState.checkpointClock.currentTime()} for shardId $shardId") + } + } catch { + case e: Throwable => { + /** + * If there is a failure within the batch, the batch will not be checkpointed. + * This will potentially cause records since the last checkpoint to be processed more than once. + */ + logError(s"Exception: WorkerId $workerId encountered and exception while storing or checkpointing a batch for workerId $workerId and shardId $shardId.", e) + + /** Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor. */ + throw e + } + } + } else { + /** RecordProcessor has been stopped. */ + logInfo(s"Stopped: The Spark KinesisReceiver has stopped for workerId $workerId and shardId $shardId. No more records will be processed.") + } + } + + /** + * Kinesis Client Library is shutting down this Worker for 1 of 2 reasons: + * 1) the stream is resharding by splitting or merging adjacent shards (ShutdownReason.TERMINATE) + * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason (ShutdownReason.ZOMBIE) + * + * @param checkpointer used to performn a Kinesis checkpoint for ShutdownReason.TERMINATE + * @param shutdown reason (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE) + */ + override def shutdown(checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason) { + logInfo(s"Shutdown: Shutting down workerId $workerId with reason $reason") + reason match { + /** + * TERMINATE Use Case. Checkpoint. + * Checkpoint to indicate that all records from the shard have been drained and processed. + * It's now OK to read from the new shards that resulted from a resharding event. + */ + case ShutdownReason.TERMINATE => KinesisUtils.retry(checkpointer.checkpoint(), 4, 500) + + /** + * ZOMBIE Use Case. NoOp. + * No checkpoint because other workers may have taken over and already started processing the same records. + * This may lead to records being processed more than once. + */ + case ShutdownReason.ZOMBIE => + + /** Unknown reason. NoOp */ + case _ => + } + } +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala new file mode 100644 index 0000000000000..172c9b14eebca --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.receiver.Receiver +import com.amazonaws.auth.AWSCredentialsProvider +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import org.apache.spark.SparkConf +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.SparkContext +import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.Seconds +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import scala.reflect.ClassTag +import org.apache.spark.streaming.api.java.JavaStreamingContext +import org.apache.spark.streaming.api.java.JavaReceiverInputDStream +import java.nio.ByteBuffer + +/** + * Convert custom types to/from Array[Byte]. + * @tparam type to serialize/deserialize + */ +private[streaming] trait KinesisRecordSerializer[T] extends Serializable { + /** + * Convert type to Array[Byte] + * + * @param type to serialize + * @return byte array + */ + def serialize(t: T): Array[Byte] + + /** + * Convert Array[Byte] to type + * + * @param byte array + * @return deserialized type + */ + def deserialize(array: Array[Byte]): T +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala new file mode 100644 index 0000000000000..4fd9c39b3c535 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import java.nio.ByteBuffer +import java.nio.charset.Charset +import java.nio.CharBuffer +import org.apache.spark.Logging + +/** + * Implementation of KinesisRecordSerializer to convert Array[Byte] to/from String. 
+ */ +class KinesisStringRecordSerializer extends KinesisRecordSerializer[String] with Logging { + /** + * Convert String to Array[Byte] + * + * @param string to serialize + * @return byte array + */ + def serialize(string: String): Array[Byte] = { + string.getBytes() + } + + /** + * Convert Array[Byte] to String + * + * @param byte array + * @return deserialized string + */ + def deserialize(array: Array[Byte]): String = { + new String(array) + } +} diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala new file mode 100644 index 0000000000000..0c3a3cc0043a6 --- /dev/null +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.api.java.JavaReceiverInputDStream +import org.apache.spark.streaming.api.java.JavaStreamingContext +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException +import org.apache.spark.Logging +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor +import scala.util.Random +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.streaming.util.Clock +import org.apache.spark.streaming.util.SystemClock + +/** + * Facade to create the Scala-based or Java-based streams. + * Also, contains a reusable utility methods. + */ +object KinesisUtils extends Logging { + /** + * Create an InputDStream that pulls messages from a Kinesis stream. + * + * @param StreamingContext object + * @param app name + * @param stream name + * @param endpoint + * @param checkpoint interval (millis) for Kinesis checkpointing (not Spark checkpointing). + * See the Kinesis Spark Streaming documentation for more details on the different types of checkpoints. + * The default is TRIM_HORIZON to avoid potential data loss. However, this presents the risk of processing records more than once. 
+ * @param in the absence of Kinesis checkpoint info, this is the worker's initial starting position in the stream. + * The values are either the beginning of the stream per Kinesis' limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) + * or the tip of the stream using InitialPositionInStream.LATEST. + * The default is StorageLevel.MEMORY_AND_DISK_2 which replicates in-memory and on-disk to 2 nodes total (primary and secondary) + * + * @return ReceiverInputDStream[Array[Byte]] + */ + def createStream( + ssc: StreamingContext, + app: String, + stream: String, + endpoint: String, + checkpointIntervalMillis: Long, + initialPositionInStream: InitialPositionInStream = InitialPositionInStream.TRIM_HORIZON, + storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2): ReceiverInputDStream[Array[Byte]] = { + + ssc.receiverStream(new KinesisReceiver(app, stream, endpoint, checkpointIntervalMillis, initialPositionInStream, storageLevel)) + } + + /** + * Create a Java-friendly InputDStream that pulls messages from a Kinesis stream. + * + * @param JavaStreamingContext object + * @param app name + * @param stream name + * @param endpoint + * @param checkpoint interval (millis) for Kinesis checkpointing (not Spark checkpointing). + * See the Kinesis Spark Streaming documentation for more details on the different types of checkpoints. + * The default is TRIM_HORIZON to avoid potential data loss. However, this presents the risk of processing records more than once. + * @param in the absence of Kinesis checkpoint info, this is the worker's initial starting position in the stream. + * The values are either the beginning of the stream per Kinesis' limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) + * or the tip of the stream using InitialPositionInStream.LATEST. + * The default is StorageLevel.MEMORY_AND_DISK_2 which replicates in-memory and on-disk to 2 nodes total (primary and secondary) + * + * @return JavaReceiverInputDStream[Array[Byte]] + */ + def createJavaStream( + jssc: JavaStreamingContext, + app: String, + stream: String, + endpoint: String, + checkpointIntervalMillis: Long, + initialPositionInStream: InitialPositionInStream = InitialPositionInStream.TRIM_HORIZON, + storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2): JavaReceiverInputDStream[Array[Byte]] = { + + jssc.receiverStream(new KinesisReceiver(app, stream, endpoint, checkpointIntervalMillis, initialPositionInStream, storageLevel)) + } + + /** + * Create checkpoint state using the existing system clock + * @param checkpointIntervalMillis + */ + def createCheckpointState(checkpointIntervalMillis: Long): CheckpointState = { + new CheckpointState(checkpointIntervalMillis) + } + + /** + * Retry the given amount of times with a random backoff time (millis) less than the given maxBackOffMillis + * + * @param expression expression to evalute + * @param numRetriesLeft number of retries left + * @param maxBackOffMillis: max millis between retries + * + * @return Evaluation of the given expression + * @throws Unretryable exception, unexpected exception, + * or any exception that persists after numRetriesLeft reaches 0 + */ + @annotation.tailrec + def retry[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { + util.Try { expression } match { + /** If the function succeeded, evaluate to x. 
*/ + case util.Success(x) => x + /** If the function failed, either retry or throw the exception */ + case util.Failure(e) => e match { + /** Retry: Throttling or other Retryable exception has occurred */ + case _: ThrottlingException | _: KinesisClientLibDependencyException if numRetriesLeft > 1 => { + val backOffMillis = Random.nextInt(maxBackOffMillis) + Thread.sleep(backOffMillis) + logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e) + retry(expression, numRetriesLeft - 1, maxBackOffMillis) + } + /** Throw: Shutdown has been requested by the Kinesis Client Library.*/ + case _: ShutdownException => { + logError(s"ShutdownException: Caught shutdown exception, skipping checkpoint.", e) + throw e + } + /** Throw: Non-retryable exception has occurred with the Kinesis Client Library */ + case _: InvalidStateException => { + logError(s"InvalidStateException: Cannot save checkpoint to the DynamoDB table used by the Amazon Kinesis Client Library. Table likely doesn't exist.", e) + throw e + } + /** Throw: Unexpected exception has occurred */ + case _ => { + logError(s"Unexpected, non-retryable exception.", e) + throw e + } + } + } + } +} diff --git a/extras/spark-kinesis-asl/src/test/resources/log4j.properties b/extras/spark-kinesis-asl/src/test/resources/log4j.properties new file mode 100644 index 0000000000000..f6bf583b740cd --- /dev/null +++ b/extras/spark-kinesis-asl/src/test/resources/log4j.properties @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Set everything to be logged to the file streaming/target/unit-tests.log +log4j.rootCategory=WARN, console + +# File appender +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=false +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n + +# Console appender +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO + +# Log all Kinesis Streaming messages +log4j.logger.org.apache.spark.examples.streaming=DEBUG +log4j.logger.org.apache.spark.streaming.Kinesis=DEBUG \ No newline at end of file diff --git a/extras/spark-kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/spark-kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala new file mode 100644 index 0000000000000..3d86a7a17fa12 --- /dev/null +++ b/extras/spark-kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.streaming.kinesis + +import java.nio.ByteBuffer +import java.nio.CharBuffer +import java.nio.charset.Charset +import scala.collection.JavaConversions.seqAsJavaList +import org.scalatest.BeforeAndAfter +import org.scalatest.FunSuite +import org.scalatest.Matchers +import org.scalatest.PrivateMethodTester +import org.scalatest.mock.EasyMockSugar +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.model.Record +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.streaming.receiver.Receiver +import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.streaming.util.SystemClock +import org.apache.spark.streaming.util.Clock + +/** + * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor + */ +class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter with EasyMockSugar { + val app = "TestKinesisReceiver" + val stream = "mySparkStream" + val endpoint = "endpoint-url" + val workerId = "dummyWorkerId" + val shardId = "dummyShardId" + + val record1 = new Record() + record1.setData(ByteBuffer.wrap("Spark In Action".getBytes())) + val record2 = new Record() + record2.setData(ByteBuffer.wrap("Learning Spark".getBytes())) + val batch = List[Record](record1, record2) + val expectedArrayBuffer = new ArrayBuffer[Array[Byte]]() += record1.getData().array() += record2.getData().array() + + var receiverMock: KinesisReceiver = _ + var checkpointerMock: IRecordProcessorCheckpointer = _ + var checkpointClockMock: ManualClock = _ + var checkpointStateMock: CheckpointState = _ + var currentClockMock: Clock = _ + + before { + receiverMock = mock[KinesisReceiver] + checkpointerMock = mock[IRecordProcessorCheckpointer] + checkpointClockMock = mock[ManualClock] + checkpointStateMock = mock[CheckpointState] + currentClockMock = mock[Clock] + } + + test("process records including store and checkpoint") { + val expectedCheckpointIntervalMillis = 10 + expecting { + receiverMock.isStopped().andReturn(false).once() + receiverMock.store(expectedArrayBuffer).once() + checkpointStateMock.shouldCheckpoint().andReturn(true).once() + checkpointerMock.checkpoint().once() + checkpointStateMock.advanceCheckpoint().once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + + test("shouldn't store and checkpoint when receiver is stopped") { + expecting { + receiverMock.isStopped().andReturn(true).once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + 
+ test("shouldn't checkpoint when exception occurs during store") { + expecting { + receiverMock.isStopped().andReturn(false).once() + receiverMock.store(expectedArrayBuffer).andThrow(new RuntimeException()).once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + intercept[RuntimeException] { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + } + + test("should set checkpoint time to currentTime + checkpoint interval upon instantiation") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointIntervalMillis = 10 + val checkpointState = new CheckpointState(checkpointIntervalMillis, currentClockMock) + assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + } + } + + test("should checkpoint if we have exceeded the checkpoint interval") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointState = new CheckpointState(Long.MinValue, currentClockMock) + assert(checkpointState.shouldCheckpoint()) + } + } + + test("shouldn't checkpoint if we have not exceeded the checkpoint interval") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointState = new CheckpointState(Long.MaxValue, currentClockMock) + assert(!checkpointState.shouldCheckpoint()) + } + } + + test("should add to time when advancing checkpoint") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointIntervalMillis = 10 + val checkpointState = new CheckpointState(checkpointIntervalMillis, currentClockMock) + assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + checkpointState.advanceCheckpoint() + assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) + } + } + + test("shutdown should checkpoint if the reason is TERMINATE") { + expecting { + checkpointerMock.checkpoint().once() + } + whenExecuting(checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + val reason = ShutdownReason.TERMINATE + recordProcessor.shutdown(checkpointerMock, reason) + } + } + + test("shutdown should not checkpoint if the reason is something other than TERMINATE") { + expecting { + } + whenExecuting(checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) + recordProcessor.shutdown(checkpointerMock, null) + } + } + + test("string record converter") { + val expectedString = "http://sparkinaction.com" + val expectedByteArray = expectedString.getBytes() + val stringRecordSerializer = new KinesisStringRecordSerializer() + + expectedByteArray should be(stringRecordSerializer.serialize(expectedString)) + + expectedString should be(stringRecordSerializer.deserialize(expectedByteArray)) + expectedString should be(stringRecordSerializer.deserialize(stringRecordSerializer.serialize(expectedString))) + } + + test("retry success on first attempt") { + val expectedIsStopped = false + expecting { + receiverMock.isStopped().andReturn(expectedIsStopped).once() + } + whenExecuting(receiverMock) { + val actualVal = 
KinesisUtils.retry(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + } + } + + test("retry success on second attempt after a Kinesis throttling exception") { + val expectedIsStopped = false + expecting { + receiverMock.isStopped().andThrow(new ThrottlingException("error message")).andReturn(expectedIsStopped).once() + } + whenExecuting(receiverMock) { + val actualVal = KinesisUtils.retry(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + } + } + + test("retry success on second attempt after a Kinesis dependency exception") { + val expectedIsStopped = false + expecting { + receiverMock.isStopped().andThrow(new KinesisClientLibDependencyException("error message")).andReturn(expectedIsStopped).once() + } + whenExecuting(receiverMock) { + val actualVal = KinesisUtils.retry(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + } + } + + test("retry failed after a shutdown exception") { + expecting { + checkpointerMock.checkpoint().andThrow(new ShutdownException("error message")).once() + } + whenExecuting(checkpointerMock) { + intercept[ShutdownException] { + KinesisUtils.retry(checkpointerMock.checkpoint(), 2, 100) + } + } + } + + test("retry failed after an invalid state exception") { + expecting { + checkpointerMock.checkpoint().andThrow(new InvalidStateException("error message")).once() + } + whenExecuting(checkpointerMock) { + intercept[InvalidStateException] { + KinesisUtils.retry(checkpointerMock.checkpoint(), 2, 100) + } + } + } + + test("retry failed after unexpected exception") { + expecting { + checkpointerMock.checkpoint().andThrow(new RuntimeException("error message")).once() + } + whenExecuting(checkpointerMock) { + intercept[RuntimeException] { + KinesisUtils.retry(checkpointerMock.checkpoint(), 2, 100) + } + } + } + + test("retry failed after exhausing all retries") { + val expectedErrorMessage = "final try error message" + expecting { + checkpointerMock.checkpoint().andThrow(new ThrottlingException("error message")).andThrow(new ThrottlingException(expectedErrorMessage)).once() + } + whenExecuting(checkpointerMock) { + val exception = intercept[RuntimeException] { + KinesisUtils.retry(checkpointerMock.checkpoint(), 2, 100) + } + exception.getMessage().shouldBe(expectedErrorMessage) + } + } +} diff --git a/pom.xml b/pom.xml index 05f76d566e9d1..4dfdee12ec7f0 100644 --- a/pom.xml +++ b/pom.xml @@ -958,6 +958,14 @@ + + + spark-kinesis-asl + + extras/spark-kinesis-asl + + + java8-tests diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 599714233c18f..b2c9fd91e0e91 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -88,7 +88,7 @@ object SparkBuild extends Build { lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core) lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings) - .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*) + .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*) dependsOn(maybeKinesis: _*) lazy val assembleDepsTask = TaskKey[Unit]("assemble-deps") lazy val assembleDeps = assembleDepsTask := { @@ -135,6 +135,15 @@ object SparkBuild extends Build { val maybeGanglia: Seq[ClasspathDependency] = if (isGangliaEnabled) Seq(gangliaProj) else Seq() val maybeGangliaRef: Seq[ProjectReference] = if (isGangliaEnabled) 
Seq(gangliaProj) else Seq() + // Include Kinesis integration if the user has enabled Kinesis + // This is isolated from the normal build due to ASL-licensed code in the library + lazy val isKinesisEnabled = Properties.envOrNone("SPARK_KINESIS_ASL").isDefined + lazy val kinesisProj = Project("spark-kinesis-asl", file("extras/spark-kinesis-asl"), settings = kinesisSettings) + .dependsOn(streaming % "compile->compile;test->test") + val maybeKinesis: Seq[ClasspathDependency] = if (isKinesisEnabled) Seq(kinesisProj) else Seq() + val maybeKinesisRef: Seq[ProjectReference] = if (isKinesisEnabled) Seq(kinesisProj) else Seq() + + // Include the Java 8 project if the JVM version is 8+ lazy val javaVersion = System.getProperty("java.specification.version") lazy val isJava8Enabled = javaVersion.toDouble >= "1.8".toDouble @@ -171,7 +180,7 @@ object SparkBuild extends Build { .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*) // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects - lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef + lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef ++ maybeKinesisRef lazy val allProjects = packageProjects ++ allExternalRefs ++ Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests @@ -588,6 +597,14 @@ object SparkBuild extends Build { libraryDependencies += "com.codahale.metrics" % "metrics-ganglia" % "3.0.0" ) + def kinesisSettings = streamingSettings ++ Seq( + name := "spark-kinesis-asl", + libraryDependencies ++= Seq( + "com.amazonaws" % "amazon-kinesis-client" % "1.1.0", + "com.amazonaws" % "aws-java-sdk" % "1.8.3" + ) + ) + def java8TestsSettings = sharedSettings ++ Seq( name := "java8-tests", javacOptions := Seq("-target", "1.8", "-source", "1.8"), From cd68c0d7bb0c1ef38e7c92d0cd6eb4a7ccf2ce27 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Fri, 18 Jul 2014 20:16:13 -0700 Subject: [PATCH 02/12] fixed typos and backward compatibility --- docs/streaming-programming-guide.md | 2 +- .../spark/streaming/kinesis/KinesisRecordProcessor.scala | 3 +-- project/SparkBuild.scala | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index c91a23b5c0c94..8f9b7c1fa0f0a 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -518,7 +518,7 @@ depending on the checkpoint frequency.
  • Failed or latent KinesisReceivers will be detected and automatically shut down or load-balanced by the KCL.
  • If a failure occurs, explicitly shut down the worker where possible.
  • -Example KinesisWordCount (and JavaKiensisWordCount) notes: +Example KinesisWordCount (and JavaKinesisWordCount) notes:
  • These examples automatically determine the number of threads to run locally based on the number of shards for the stream.
  • These examples automatically determine the number of KinesisReceivers/InputDStreams to create based on the number of shards for the stream, as sketched below.
  • These examples use InitialPositionInStream.LATEST (tip of stream) vs. InitialPositionInStream.TRIM_HORIZON (back 24 hours) to simplify reasoning about the examples.
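A minimal sketch of the pattern these notes describe, assuming the simplified `KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, checkpointIntervalMillis, initialPosition)` overload introduced later in this patch series; the app name, stream name, endpoint, and shard count below are placeholders (a real application would look the shard count up via the Kinesis client rather than hard-coding it):

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.streaming.kinesis.KinesisUtils
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

object KinesisUnionSketch {
  def main(args: Array[String]) {
    val batchInterval = Milliseconds(2000)
    val ssc = new StreamingContext(
      new SparkConf().setAppName("KinesisUnionSketch"), batchInterval)

    // Placeholder values; a real app would query the shard count with
    // AmazonKinesisClient.describeStream() and pass stream/endpoint as args.
    val numShards = 2
    val kinesisCheckpointIntervalMillis = batchInterval.milliseconds

    // One receiver/DStream per shard, as the notes above describe ...
    val shardStreams = (0 until numShards).map { _ =>
      KinesisUtils.createStream(ssc, "KinesisWordCount", "mySparkStream",
        "https://kinesis.us-east-1.amazonaws.com", kinesisCheckpointIntervalMillis,
        InitialPositionInStream.LATEST)
    }

    // ... then union them into a single DStream of raw Array[Byte] records.
    val unionStream = ssc.union(shardStreams)
    unionStream.flatMap(bytes => new String(bytes).split(" ")).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```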
  • diff --git a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index 8dd24501fe381..c9e8ecd2ebb14 100644 --- a/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/spark-kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -36,8 +36,7 @@ import org.apache.spark.streaming.util.Clock * * @param Kinesis receiver * @param workerId for logging purposes - * @param checkpoint utils - * @param Kinesis checkpoint interval (millis) + * @param checkpoint state */ private[streaming] class KinesisRecordProcessor( receiver: KinesisReceiver, diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 4f4e2d11f1c00..097da182e2902 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -65,7 +65,7 @@ object SparkBuild extends PomBuild { } if (Properties.envOrNone("SPARK_KINESIS_ASL").isDefined) { println("NOTE: SPARK_KINESIS_ASL is deprecated, please use -Pspark-kinesis-asl flag.") - profiles ++= Seq("spark-ganglia-lgpl") + profiles ++= Seq("spark-kinesis-asl") } if (Properties.envOrNone("SPARK_HIVE").isDefined) { println("NOTE: SPARK_HIVE is deprecated, please use -Phive flag.") From 828f8aeb1081cf7ad9e5386e1cce933ece9c3d62 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Mon, 21 Jul 2014 22:20:42 -0700 Subject: [PATCH 03/12] more cleanup --- .../sbt_app_core/src/main/scala/SparkApp.scala | 4 ++-- extras/spark-kinesis-asl/pom.xml | 8 ++++++-- project/SparkBuild.scala | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala index 225d82a6c4876..e80c6bb614816 100644 --- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -47,14 +47,14 @@ object SimpleApp { System.exit(-1) } if (foundGanglia) { - println("Ganglia sink was loaded via spark-core") + println("Ganglia sink was loaded via spark-ganglia-lgpl") System.exit(-1) } // Remove kinesis from default build due to ASL license issue val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisReceiver")).isSuccess if (foundKinesis) { - println("Kinesis was loaded via spark-core") + println("Kinesis was loaded via spark-kinesis-asl") System.exit(-1) } } diff --git a/extras/spark-kinesis-asl/pom.xml b/extras/spark-kinesis-asl/pom.xml index 1b4101194d42f..6e2fdc9f13690 100644 --- a/extras/spark-kinesis-asl/pom.xml +++ b/extras/spark-kinesis-asl/pom.xml @@ -24,14 +24,18 @@ ../../pom.xml - + org.apache.spark spark-kinesis-asl_2.10 jar Spark Kinesis Integration - spark-kinesis-asl + kinesis-asl diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 097da182e2902..a2e2f54745fed 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -36,7 +36,7 @@ object BuildCommons { "streaming-zeromq").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, yarnAlpha, java8Tests, sparkGangliaLgpl, sparkKinesisAsl) = - Seq("yarn", "yarn-stable", "yarn-alpha", "java8-tests", "ganglia-lgpl", "spark-kinesis-asl") + Seq("yarn", "yarn-stable", "yarn-alpha", "java8-tests", "ganglia-lgpl", "kinesis-asl") .map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples) = Seq("assembly", 
"examples") @@ -60,7 +60,7 @@ object SparkBuild extends PomBuild { var isAlphaYarn = false var profiles: mutable.Seq[String] = mutable.Seq.empty if (Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined) { - println("NOTE: SPARK_GANGLIA_LGPL is deprecated, please use -Pganglia-lgpl flag.") + println("NOTE: SPARK_GANGLIA_LGPL is deprecated, please use -Pspark-ganglia-lgpl flag.") profiles ++= Seq("spark-ganglia-lgpl") } if (Properties.envOrNone("SPARK_KINESIS_ASL").isDefined) { From 338997e6e750c206bfb50a654b725be5f33beb07 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Tue, 22 Jul 2014 08:54:35 -0700 Subject: [PATCH 04/12] improve build docs for kinesis --- docs/streaming-programming-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 8f9b7c1fa0f0a..75d320fae4620 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -472,10 +472,10 @@ Furthermore, you can also implement your own custom receiver for your sources. S Build notes:
  • Spark supports a Kinesis Streaming Receiver which is not included in the default build due to licensing restrictions.
  • _**Note that by embedding this library you will include [ASL](https://aws.amazon.com/asl/)-licensed code in your Spark package**_.
  • -
  • For sbt users, set the `SPARK_KINESIS_ASL` environment variable before building.
  • -
  • For Maven users, enable the `-Pspark-kinesis-asl` profile.
  • -
  • User applications will need to link to the `spark-kinesis-asl` artifact.
  • The Spark Kinesis Streaming Receiver source code, examples, tests, and artifacts live in $SPARK_HOME/extras/spark-kinesis-asl.
  • +
  • Both sbt and Maven builds must enable the `-Pspark-kinesis-asl` profile.
  • +
  • To build the examples JAR, you must run the Maven build with the `-Pspark-kinesis-asl` profile.
  • +
  • Applications will need to link to the `spark-kinesis-asl` artifact, as sketched in the build excerpt below.
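For illustration, a hedged sketch of what an application's sbt build definition might look like when linking against the artifact named above. The artifact id and version are placeholders taken from these notes, not authoritative coordinates (a later commit in this series renames the artifact to `spark-streaming-kinesis-asl`):

```scala
// Hypothetical application build.sbt excerpt; coordinates are assumptions.
name := "my-kinesis-app"

scalaVersion := "2.10.4"

libraryDependencies ++= Seq(
  // Core streaming dependency, typically provided by the Spark runtime.
  "org.apache.spark" %% "spark-streaming" % "1.1.0-SNAPSHOT" % "provided",
  // ASL-licensed Kinesis support; only available if Spark was built and
  // published with the -Pspark-kinesis-asl profile enabled.
  "org.apache.spark" %% "spark-kinesis-asl" % "1.1.0-SNAPSHOT"
)
```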
  • Deployment and runtime notes:
  • Each shard of a stream is processed by one or more KinesisReceivers managed by the Kinesis Client Library (KCL) Worker.
  • From 6c395619dde93a9b8e9137b1150de4ae5129cf4b Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Wed, 23 Jul 2014 20:55:55 -0700 Subject: [PATCH 05/12] parameterized the versions of the aws java sdk and kinesis client --- extras/spark-kinesis-asl/pom.xml | 4 ++-- pom.xml | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/extras/spark-kinesis-asl/pom.xml b/extras/spark-kinesis-asl/pom.xml index 6e2fdc9f13690..adb63d5464754 100644 --- a/extras/spark-kinesis-asl/pom.xml +++ b/extras/spark-kinesis-asl/pom.xml @@ -52,12 +52,12 @@ com.amazonaws amazon-kinesis-client - 1.1.0 + ${aws.kinesis.client.version} com.amazonaws aws-java-sdk - 1.8.3 + ${aws.java.sdk.version} org.scalatest diff --git a/pom.xml b/pom.xml index 8619e4fa43b6e..0dece16192017 100644 --- a/pom.xml +++ b/pom.xml @@ -132,6 +132,8 @@ 3.0.0 1.7.6 0.7.1 + 1.8.3 + 1.1.0 64m 512m From 912640cb344c77102e4ca4d884b8b0d0206ed627 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Wed, 30 Jul 2014 18:03:27 -0700 Subject: [PATCH 06/12] changed the foundKinesis class to be a publically-avail class --- dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala index 07884afaf169e..025f71a1ce45a 100644 --- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -52,7 +52,7 @@ object SimpleApp { } // Remove kinesis from default build due to ASL license issue - val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisReceiver")).isSuccess + val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess if (foundKinesis) { println("Kinesis was loaded via kinesis-asl") System.exit(-1) From d17ca6d6a36ddf0a3030eacae0eace3fdd758cc5 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Thu, 31 Jul 2014 10:00:09 -0700 Subject: [PATCH 07/12] per TD's feedback: updated docs, simplified the KinesisUtils api --- .../streaming/JavaKinesisWordCount.java | 4 +- .../examples/streaming/KinesisWordCount.scala | 6 +- .../streaming/kinesis/KinesisReceiver.scala | 26 +++---- .../streaming/kinesis/KinesisUtils.scala | 69 +++++++------------ 4 files changed, 38 insertions(+), 67 deletions(-) diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java index be699a2b8f86e..8543c07aed141 100644 --- a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java +++ b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java @@ -165,14 +165,14 @@ public static void main(String[] args) { /** Create the same number of Kinesis Receivers/DStreams as stream shards, then union them all */ JavaDStream allStreams = KinesisUtils .createStream(jssc, appName, stream, endpoint, checkpointInterval.milliseconds(), - InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); + InitialPositionInStream.LATEST); /** Set the checkpoint interval */ allStreams.checkpoint(checkpointInterval); for (int i = 1; i < numStreams; i++) { /** Create a new Receiver/DStream for each stream shard */ JavaDStream dStream = KinesisUtils .createStream(jssc, appName, stream, endpoint, checkpointInterval.milliseconds(), - InitialPositionInStream.LATEST, 
StorageLevel.MEMORY_AND_DISK_2()); + InitialPositionInStream.LATEST); /** Set the Spark checkpoint interval */ dStream.checkpoint(checkpointInterval); diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala index d0e6cdb75cd26..bb036f4d1741e 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala @@ -159,15 +159,13 @@ object KinesisWordCount extends Logging { * them all. */ var allStreams: DStream[Array[Byte]] = KinesisUtils.createStream(ssc, appName, stream, - endpoint, checkpointInterval.milliseconds, InitialPositionInStream.LATEST, - StorageLevel.MEMORY_AND_DISK_2) + endpoint, checkpointInterval.milliseconds, InitialPositionInStream.LATEST) /** Set the checkpoint interval */ allStreams.checkpoint(checkpointInterval) for (i <- 1 until numStreams) { /** Create a new Receiver/DStream for each stream shard */ val dStream = KinesisUtils.createStream(ssc, appName, stream, endpoint, - checkpointInterval.milliseconds, - InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) + checkpointInterval.milliseconds, InitialPositionInStream.LATEST) /** Set the Spark checkpoint interval */ dStream.checkpoint(checkpointInterval) diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 3f0828431fe15..d6e4b7996877c 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -41,29 +41,23 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker * Instances of this class will get shipped to the Spark Streaming Workers * to run within a Spark Executor. * - * @param appName Kinesis Application Name. Kinesis apps are mapped to Kinesis streams - * by the Kinesis Client Library. If you change the app name or stream name, - * the KCL will throw errors. + * @param appName unique name for your Kinesis app. Multiple instances of the app pull from + * the same stream. The Kinesis Client Library coordinates all load-balancing and + * failure-recovery. * @param stream Kinesis stream name - * @param endpoint url of Kinesis service - * @param checkpointIntervalMillis for Kinesis checkpointing (not Spark checkpointing). - * See the Kinesis Spark Streaming documentation for more details on the different types - * of checkpoints. - * @param initialPositionInStream in the absence of Kinesis checkpoint info, this is the worker's initial - * starting position in the stream. - * The values are either the beginning of the stream per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or the tip of the stream - * (InitialPositionInStream.LATEST). - * @param persistence strategy for RDDs and DStreams. + * @param endpoint url of Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing + * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. 
*/ private[kinesis] class KinesisReceiver( appName: String, stream: String, endpoint: String, checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream, - storageLevel: StorageLevel) - extends Receiver[Array[Byte]](storageLevel) with Logging { receiver => + initialPositionInStream: InitialPositionInStream) + extends Receiver[Array[Byte]](StorageLevel.MEMORY_AND_DISK_2) with Logging { receiver => /** * The following vars are built in the onStart() method which executes in the Spark Worker after diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index 8a113bb46ddd9..f3b60f1c49686 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -28,8 +28,7 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionIn /** - * Facade to create the Scala-based or Java-based streams. - * Also, contains a reusable utility methods. + * Helper class to create Amazon Kinesis Input Stream * :: Experimental :: */ @Experimental @@ -37,25 +36,16 @@ object KinesisUtils extends Logging { /** * Create an InputDStream that pulls messages from a Kinesis stream. * - * @param StreamingContext object - * @param appName Kinesis Application Name. Kinesis Apps are mapped to Kinesis Streams - * by the Kinesis Client Library. If you change the App name or Stream name, - * the KCL will throw errors. - * @param stream Kinesis Stream Name - * @param endpoint url of Kinesis service - * @param checkpoint interval (millis) for Kinesis checkpointing (not Spark checkpointing). - * See the Kinesis Spark Streaming documentation for more details on the different types - * of checkpoints. - * @param initialPositionInStream in the absence of Kinesis checkpoint info, this is the + * @param ssc StreamingContext + * @param appName unique name for your Kinesis app. Multiple instances of the app pull from + * the same stream. The Kinesis Client Library coordinates all load-balancing and + * failure-recovery. + * @param stream Kinesis stream name + * @param endpoint url of Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing + * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the * worker's initial starting position in the stream. - * The values are either the beginning of the stream per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or the tip of the stream - * (InitialPositionInStream.LATEST). - * The default is TRIM_HORIZON to avoid potential data loss. However, this presents the risk - * of processing records more than once. 
- * @param storageLevel The default is StorageLevel.MEMORY_AND_DISK_2 which replicates in-memory - * and on-disk to 2 nodes total (primary and secondary) - * * @return ReceiverInputDStream[Array[Byte]] */ def createStream( @@ -64,34 +54,24 @@ object KinesisUtils extends Logging { stream: String, endpoint: String, checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream, - storageLevel: StorageLevel): ReceiverInputDStream[Array[Byte]] = { + initialPositionInStream: InitialPositionInStream): ReceiverInputDStream[Array[Byte]] = { ssc.receiverStream(new KinesisReceiver(appName, stream, endpoint, checkpointIntervalMillis, - initialPositionInStream, storageLevel)) + initialPositionInStream )) } /** * Create a Java-friendly InputDStream that pulls messages from a Kinesis stream. * - * @param JavaStreamingContext object - * @param appName Kinesis Application Name. Kinesis Apps are mapped to Kinesis Streams - * by the Kinesis Client Library. If you change the App name or Stream name, - * the KCL will throw errors. - * @param stream Kinesis Stream Name - * @param endpoint url of Kinesis service - * @param checkpoint interval (millis) for Kinesis checkpointing (not Spark checkpointing). - * See the Kinesis Spark Streaming documentation for more details on the different types - * of checkpoints. - * @param initialPositionInStream in the absence of Kinesis checkpoint info, this is the + * @param jssc Java StreamingContext object + * @param appName unique name for your Kinesis app. Multiple instances of the app pull from + * the same stream. The Kinesis Client Library coordinates all load-balancing and + * failure-recovery. + * @param stream Kinesis stream name + * @param endpoint url of Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) + * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing + * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the * worker's initial starting position in the stream. - * The values are either the beginning of the stream per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or the tip of the stream - * (InitialPositionInStream.LATEST). - * The default is TRIM_HORIZON to avoid potential data loss. However, this presents the risk - * of processing records more than once. 
- * @param storageLevel The default is StorageLevel.MEMORY_AND_DISK_2 which replicates in-memory - * and on-disk to 2 nodes total (primary and secondary) - * * @return JavaReceiverInputDStream[Array[Byte]] */ def createStream( @@ -99,10 +79,9 @@ object KinesisUtils extends Logging { appName: String, stream: String, endpoint: String, - checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream, - storageLevel: StorageLevel): JavaReceiverInputDStream[Array[Byte]] = { + checkpointIntervalMillis: Long, + initialPositionInStream: InitialPositionInStream): JavaReceiverInputDStream[Array[Byte]] = { jssc.receiverStream(new KinesisReceiver(appName, stream, endpoint, checkpointIntervalMillis, - initialPositionInStream, storageLevel)) + initialPositionInStream)) } } From bf614e9ed870a3c23670d3783d574b1e4280bd81 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Thu, 31 Jul 2014 10:33:20 -0700 Subject: [PATCH 08/12] per matei's feedback: moved the kinesis examples into the examples/ dir --- examples/pom.xml | 5 ++ .../streaming/JavaKinesisWordCount.java | 2 +- .../examples/streaming/KinesisWordCount.scala | 4 +- extras/kinesis-asl/bin/run-kinesis-example | 60 ------------- .../kinesis-asl/bin/run-kinesis-example.cmd | 90 ------------------- .../kinesis/KinesisRecordProcessor.scala | 2 +- .../streaming/kinesis/KinesisUtils.scala | 2 +- 7 files changed, 10 insertions(+), 155 deletions(-) rename {extras/kinesis-asl => examples}/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java (99%) rename {extras/kinesis-asl => examples}/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala (99%) delete mode 100755 extras/kinesis-asl/bin/run-kinesis-example delete mode 100755 extras/kinesis-asl/bin/run-kinesis-example.cmd diff --git a/examples/pom.xml b/examples/pom.xml index c4ed0f5a6a02b..d87ac68238eb9 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -96,6 +96,11 @@ spark-streaming-mqtt_${scala.binary.version} ${project.version} + + org.apache.spark + kinesis-asl_${scala.binary.version} + ${project.version} + org.apache.hbase hbase diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java similarity index 99% rename from extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java rename to examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java index 8543c07aed141..f13d3c9acce8b 100644 --- a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java @@ -73,7 +73,7 @@ * Example: * $ export AWS_ACCESS_KEY_ID= * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/extras/kinesis-asl/bin/run-kinesis-example \ + * $ $SPARK_HOME/bin/run-example \ * org.apache.spark.examples.streaming.JavaKinesisWordCount mySparkStream \ * https://kinesis.us-east-1.amazonaws.com * diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala similarity index 99% rename from extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala rename to examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala index bb036f4d1741e..50c3889d277fa 100644 --- 
a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala @@ -68,7 +68,7 @@ import com.amazonaws.services.kinesis.model.PutRecordRequest * Example: * $ export AWS_ACCESS_KEY_ID= * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/extras/kinesis-asl/bin/run-kinesis-example \ + * $ $SPARK_HOME/bin/run-example \ * org.apache.spark.examples.streaming.KinesisWordCount mySparkStream \ * https://kinesis.us-east-1.amazonaws.com * @@ -260,7 +260,7 @@ object KinesisWordCount extends Logging { * Example: * $ export AWS_ACCESS_KEY_ID= * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/extras/kinesis-asl/bin/run-kinesis-example \ + * $ $SPARK_HOME/bin/run-example \ * org.apache.spark.examples.streaming.KinesisWordCountProducer mySparkStream \ * https://kinesis.us-east-1.amazonaws.com 10 5 */ diff --git a/extras/kinesis-asl/bin/run-kinesis-example b/extras/kinesis-asl/bin/run-kinesis-example deleted file mode 100755 index 6cf01fbe773a4..0000000000000 --- a/extras/kinesis-asl/bin/run-kinesis-example +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCALA_VERSION=2.10 - -FWDIR="$(cd `dirname $0`/../../../; pwd)" -export SPARK_HOME="$FWDIR" -KINESIS_EXAMPLES_DIR="$FWDIR"/extras/kinesis-asl - -if [ -n "$1" ]; then - EXAMPLE_CLASS="$1" - shift -else - echo "Usage: $SPARK_HOME/extras/kinesis-asl/bin/run-kinesis-example [example-args]" 1>&2 - echo " - set MASTER=XX to use a specific master" 1>&2 - echo " - can use abbreviated example class name (e.g. KinesisWordCount, JavaKinesisWordCount)" 1>&2 - echo " - must set AWS_ACCESS_KEY_ID and AWS_SECRET_KEY env variables" 1>&2 - exit 1 -fi - -export GLOBIGNORE="*-javadoc.jar:*-sources.jar" -if [ -f "$FWDIR/RELEASE" ]; then - export SPARK_KINESIS_EXAMPLES_JAR=`ls "$FWDIR"/lib/kinesis-asl*.jar` -elif [ -e "$KINESIS_EXAMPLES_DIR"/target/kinesis-asl_$SCALA_VERSION-*.jar ]; then - export SPARK_KINESIS_EXAMPLES_JAR=`ls "$KINESIS_EXAMPLES_DIR"/target/kinesis-asl_$SCALA_VERSION-*.jar` -fi - -if [[ -z $SPARK_KINESIS_EXAMPLES_JAR ]]; then - echo "Failed to find Spark Kinesis examples assembly in "$FWDIR"/lib or "$KINESIS_EXAMPLES_DIR"/target" 1>&2 - echo "You need to build Spark with maven using 'mvn -Pkinesis-asl package' before running this program." 1>&2 - exit 1 -fi - -EXAMPLE_MASTER=${MASTER:-"local[*]"} - -if [[ ! 
$EXAMPLE_CLASS == org.apache.spark.examples.streaming* ]]; then - EXAMPLE_CLASS="org.apache.spark.examples.streaming.$EXAMPLE_CLASS" -fi - -"$FWDIR"/bin/spark-submit \ - --master $EXAMPLE_MASTER \ - --class $EXAMPLE_CLASS \ - "$SPARK_KINESIS_EXAMPLES_JAR" \ - $@ diff --git a/extras/kinesis-asl/bin/run-kinesis-example.cmd b/extras/kinesis-asl/bin/run-kinesis-example.cmd deleted file mode 100755 index 0980c78391d49..0000000000000 --- a/extras/kinesis-asl/bin/run-kinesis-example.cmd +++ /dev/null @@ -1,90 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -set SCALA_VERSION=2.10 - -rem Figure out where the Spark framework is installed -set FWDIR=%~dp0..\..\..\ - -rem Export this as SPARK_HOME -set SPARK_HOME=%FWDIR% - -rem Load environment variables from conf\spark-env.cmd, if it exists -if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" - -rem Test that an argument was given -if not "x%1"=="x" goto arg_given - echo Usage: SPARK_HOME/extras/kinesis-asl/bin run-kinesis-example ^ [example-args] - echo - set MASTER=XX to use a specific master - echo - can use abbreviated example class name (e.g. KinesisWordCount, JavaKinesisWordCount) - echo " - must set AWS_ACCESS_KEY_ID and AWS_SECRET_KEY env variables" 1>&2 - - goto exit -:arg_given - -set KINESIS_EXAMPLES_DIR=%FWDIR%extras\kinesis-asl - -rem Figure out the JAR file that our examples were packaged into. -set SPARK_KINESIS_EXAMPLES_JAR= -if exist "%FWDIR%RELEASE" ( - for %%d in ("%FWDIR%lib\kinesis-asl*.jar") do ( - set SPARK_KINESIS_EXAMPLES_JAR=%%d - ) -) else ( - for %%d in ("%KINESIS_EXAMPLES_DIR%\target\kinesis-asl*.jar") do ( - set SPARK_KINESIS_EXAMPLES_JAR=%%d - ) -) -if "x%SPARK_KINESIS_EXAMPLES_JAR%"=="x" ( - echo Failed to find Spark Kinesis examples assembly JAR. - echo You need to build Spark with maven using 'mvn -Pkinesis-asl package' before running this program. - goto exit -) - -rem Set master from MASTER environment variable if given -if "x%MASTER%"=="x" ( - set EXAMPLE_MASTER=local[*] -) else ( - set EXAMPLE_MASTER=%MASTER% -) - -rem If the EXAMPLE_CLASS does not start with org.apache.spark.examples.streaming, add that -set EXAMPLE_CLASS=%1 -set PREFIX=%EXAMPLE_CLASS:~0,25% -if not %PREFIX%==org.apache.spark.examples.streaming ( - set EXAMPLE_CLASS=org.apache.spark.examples.streaming.%EXAMPLE_CLASS% -) - -rem Get the tail of the argument list, to skip the first one. This is surprisingly -rem complicated on Windows. 
-set "ARGS=" -:top -shift -if "%~1" neq "" ( - set ARGS=%ARGS% "%~1" - goto :top -) -if defined ARGS set ARGS=%ARGS:~1% - -call "%FWDIR%bin\spark-submit.cmd" ^ - --master %EXAMPLE_MASTER% ^ - --class %EXAMPLE_CLASS% ^ - "%SPARK_KINESIS_EXAMPLES_JAR%" %ARGS% - -:exit diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index 1c665cf9fd0d3..055e7297706ae 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -36,7 +36,7 @@ import com.amazonaws.services.kinesis.model.Record * @param Kinesis receiver * @param workerId for logging purposes * @param checkpointState represents the checkpoint state including the next time a - * checkpoint is needed. it's injected here for mocking purposes. + * checkpoint is needed. it's injected here for mocking purposes. */ private[kinesis] class KinesisRecordProcessor( receiver: KinesisReceiver, diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index f3b60f1c49686..2b6b833457e35 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -56,7 +56,7 @@ object KinesisUtils extends Logging { checkpointIntervalMillis: Long, initialPositionInStream: InitialPositionInStream): ReceiverInputDStream[Array[Byte]] = { ssc.receiverStream(new KinesisReceiver(appName, stream, endpoint, checkpointIntervalMillis, - initialPositionInStream )) + initialPositionInStream)) } /** From 74e5c7c3ce99f5cd30d269d62aca31d2b275288c Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Fri, 1 Aug 2014 14:14:42 -0700 Subject: [PATCH 09/12] updated per TD's feedback. 
simplified examples, updated docs --- assembly/pom.xml | 2 +- bin/run-example | 4 +- .../src/main/scala/SparkApp.scala | 4 +- dev/audit-release/sbt_app_kinesis/build.sbt | 2 +- docs/streaming-kinesis.md | 1 + examples/pom.xml | 2 +- .../streaming/JavaKinesisWordCount.java | 294 -------------- .../streaming/JavaKinesisWordCountASL.java | 187 +++++++++ .../examples/streaming/KinesisWordCount.scala | 369 ------------------ .../streaming/KinesisWordCountASL.scala | 235 +++++++++++ extras/kinesis-asl/pom.xml | 3 +- ...ate.scala => KinesisCheckpointState.scala} | 15 +- .../streaming/kinesis/KinesisReceiver.scala | 45 ++- .../kinesis/KinesisRecordProcessor.scala | 87 ++++- .../kinesis/KinesisRecordProcessorUtils.scala | 79 ---- .../kinesis/KinesisRecordSerializer.scala | 39 -- .../KinesisStringRecordSerializer.scala | 44 --- .../streaming/kinesis/KinesisUtils.scala | 78 ++-- .../kinesis/JavaKinesisStreamSuite.java | 41 ++ .../src/test/resources/log4j.properties | 1 - .../kinesis/KinesisReceiverSuite.scala | 67 ++-- make-distribution.sh | 2 - 22 files changed, 658 insertions(+), 943 deletions(-) delete mode 100644 examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java create mode 100644 examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java delete mode 100644 examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala rename extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/{CheckpointState.scala => KinesisCheckpointState.scala} (82%) delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessorUtils.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala create mode 100644 extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java diff --git a/assembly/pom.xml b/assembly/pom.xml index 824ef383d2e47..76099b074c7ed 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -190,7 +190,7 @@ org.apache.spark - kinesis-asl_${scala.binary.version} + spark-streaming-kinesis-asl_${scala.binary.version} ${project.version} diff --git a/bin/run-example b/bin/run-example index 942706d733122..65d20738260bf 100755 --- a/bin/run-example +++ b/bin/run-example @@ -29,7 +29,9 @@ if [ -n "$1" ]; then else echo "Usage: ./bin/run-example [example-args]" 1>&2 echo " - set MASTER=XX to use a specific master" 1>&2 - echo " - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)" 1>&2 + echo " - can use abbreviated example class name relative to com.apache.spark.examples" 1>&2 + echo " (e.g. 
SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL)" 1>&2 + echo " - to run the Kinesis Spark Streaming example, make sure you build with -Pkinesis-asl" 1>&2 exit 1 fi diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala index 025f71a1ce45a..fc03fec9866a6 100644 --- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -47,14 +47,14 @@ object SimpleApp { System.exit(-1) } if (foundGanglia) { - println("Ganglia sink was loaded via spark-ganglia-lgpl") + println("Ganglia sink was loaded via spark-core") System.exit(-1) } // Remove kinesis from default build due to ASL license issue val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess if (foundKinesis) { - println("Kinesis was loaded via kinesis-asl") + println("Kinesis was loaded via spark-core") System.exit(-1) } } diff --git a/dev/audit-release/sbt_app_kinesis/build.sbt b/dev/audit-release/sbt_app_kinesis/build.sbt index 9d821b9a09fbf..5dfd16c185f61 100644 --- a/dev/audit-release/sbt_app_kinesis/build.sbt +++ b/dev/audit-release/sbt_app_kinesis/build.sbt @@ -23,7 +23,7 @@ scalaVersion := System.getenv.get("SCALA_VERSION") libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") -libraryDependencies += "org.apache.spark" %% "kinesis-asl" % System.getenv.get("SPARK_VERSION") +libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") resolvers ++= Seq( "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), diff --git a/docs/streaming-kinesis.md b/docs/streaming-kinesis.md index 4c5ad434a243a..3dd6c01c4cd7c 100644 --- a/docs/streaming-kinesis.md +++ b/docs/streaming-kinesis.md @@ -16,6 +16,7 @@ Deployment and runtime notes:
  • Each shard of a stream is processed by one or more KinesisReceivers managed by the Kinesis Client Library (KCL) Worker.
  • Said differently, a single KinesisReceiver can process many shards of a stream.
  • You never need more KinesisReceivers than the number of shards in your stream.
  • + You can horizontally scale the receiving by creating more KinesisReceivers/DStreams (up to the number of shards for a given stream); see the sketch after these notes.
  • The Kinesis assembly jar must also be present on all worker nodes, as they will need access to the Kinesis Client Library.
  • Ensure that /tmp/checkpoint is a valid and accessible directory on all workers (or locally, if running in local mode).
  • This code uses the DefaultAWSCredentialsProviderChain and searches for credentials in the following order of precedence:
      Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
      Java System Properties - aws.accessKeyId and aws.secretKey
      Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
      Instance profile credentials - delivered through the Amazon EC2 metadata service
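  (A minimal sketch, not part of the patch, illustrating the horizontal-scaling note above. It assumes an existing StreamingContext named ssc and illustrative placeholders for streamName, endpointUrl, numShards, and kinesisCheckpointInterval; it mirrors the KinesisWordCountASL example introduced later in this patch.)

    import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.kinesis.KinesisUtils

    // streamName, endpointUrl, numShards, kinesisCheckpointInterval are illustrative placeholders.
    // Create one Kinesis DStream/Receiver per shard (never more than the shard count),
    // then union them into a single DStream for downstream processing.
    val kinesisStreams = (0 until numShards).map { i =>
      KinesisUtils.createStream(ssc, streamName, endpointUrl, kinesisCheckpointInterval,
        InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
    }
    val unionStream = ssc.union(kinesisStreams)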
    diff --git a/examples/pom.xml b/examples/pom.xml index d87ac68238eb9..ffcec8d56f5c6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -98,7 +98,7 @@ org.apache.spark - kinesis-asl_${scala.binary.version} + spark-streaming-kinesis-asl_${scala.binary.version} ${project.version} diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java deleted file mode 100644 index f13d3c9acce8b..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCount.java +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.streaming; - -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.storage.StorageLevel; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.Milliseconds; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; -import org.apache.spark.streaming.kinesis.KinesisRecordSerializer; -import org.apache.spark.streaming.kinesis.KinesisStringRecordSerializer; -import org.apache.spark.streaming.kinesis.KinesisUtils; - -import scala.Tuple2; - -import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; -import com.amazonaws.services.kinesis.AmazonKinesisClient; -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; -import com.google.common.base.Optional; -import com.google.common.collect.Lists; - -/** - * Java-friendly Kinesis Spark Streaming WordCount example - * - * See http://spark.apache.org/docs/latest/streaming-programming-guide.html for more details - * on the Kinesis Spark Streaming integration. - * - * This example spins up 1 Kinesis Worker (Spark Streaming Receivers) per shard - * of the given stream. - * It then starts pulling from the last checkpointed sequence number of the given - * and . 
- * - * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region - * - * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials - * in the following order of precedence: - * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY - * Java System Properties - aws.accessKeyId and aws.secretKey - * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs - * Instance profile credentials - delivered through the Amazon EC2 metadata service - * - * Usage: JavaKinesisWordCount - * is the name of the Kinesis stream (ie. mySparkStream) - * is the endpoint of the Kinesis service - * (ie. https://kinesis.us-east-1.amazonaws.com) - * - * Example: - * $ export AWS_ACCESS_KEY_ID= - * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/bin/run-example \ - * org.apache.spark.examples.streaming.JavaKinesisWordCount mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com - * - * There is a companion helper class called KinesisWordCountProducer which puts dummy data - * onto the Kinesis stream. - * Usage instructions for KinesisWordCountProducer are provided in the class definition. - */ -public final class JavaKinesisWordCount { - private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); - private static final Logger logger = Logger.getLogger(JavaKinesisWordCount.class); - - /** - * Make the constructor private to enforce singleton - */ - private JavaKinesisWordCount() { - } - - public static void main(String[] args) { - /** - * Check that all required args were passed in. - */ - if (args.length < 2) { - System.err.println("Usage: JavaKinesisWordCount "); - System.exit(1); - } - - /** - * (This was lifted from the StreamingExamples.scala in order to avoid the dependency on the spark-examples artifact.) - * Set reasonable logging levels for streaming if the user has not configured log4j. - */ - boolean log4jInitialized = Logger.getRootLogger().getAllAppenders() - .hasMoreElements(); - if (!log4jInitialized) { - /** We first log something to initialize Spark's default logging, then we override the logging level. */ - Logger.getRootLogger() - .info("Setting log level to [ERROR] for streaming example." - + " To override add a custom log4j.properties to the classpath."); - Logger.getRootLogger().setLevel(Level.ERROR); - Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); - } - - /** Populate the appropriate variables from the given args */ - String stream = args[0]; - String endpoint = args[1]; - /** Set the batch interval to a fixed 2000 millis (2 seconds) */ - Integer batchIntervalMillis = 2000; - - /** Create a Kinesis client in order to determine the number of shards for the given stream */ - AmazonKinesisClient kinesisClient = new AmazonKinesisClient( - new DefaultAWSCredentialsProviderChain()); - kinesisClient.setEndpoint(endpoint); - - /** Determine the number of shards from the stream */ - int numShards = kinesisClient.describeStream(stream) - .getStreamDescription().getShards().size(); - - /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard */ - int numStreams = numShards; - - /** Must add 1 more thread than the number of receivers or the output won't show properly from the driver */ - int numSparkThreads = numStreams + 1; - - /** Set the app name */ - String appName = "KinesisWordCount"; - - /** Setup the Spark config. 
*/ - SparkConf sparkConfig = new SparkConf().setAppName(appName).setMaster( - "local[" + numSparkThreads + "]"); - - /** - * Set the batch interval. - * Records will be pulled from the Kinesis stream and stored as a single DStream within Spark every batch interval. - */ - Duration batchInterval = Milliseconds.apply(batchIntervalMillis); - - /** - * It's recommended that you perform a Spark checkpoint between 5 and 10 times the batch interval. - * While this is the Spark checkpoint interval, we're going to use it for the Kinesis checkpoint interval, as well. - * For example purposes, we'll just use the batchInterval. - */ - Duration checkpointInterval = Milliseconds.apply(batchIntervalMillis); - - /** Setup the StreamingContext */ - JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval); - - /** Setup the checkpoint directory used by Spark Streaming */ - jssc.checkpoint("/tmp/checkpoint"); - - /** Create the same number of Kinesis Receivers/DStreams as stream shards, then union them all */ - JavaDStream allStreams = KinesisUtils - .createStream(jssc, appName, stream, endpoint, checkpointInterval.milliseconds(), - InitialPositionInStream.LATEST); - /** Set the checkpoint interval */ - allStreams.checkpoint(checkpointInterval); - for (int i = 1; i < numStreams; i++) { - /** Create a new Receiver/DStream for each stream shard */ - JavaDStream dStream = KinesisUtils - .createStream(jssc, appName, stream, endpoint, checkpointInterval.milliseconds(), - InitialPositionInStream.LATEST); - /** Set the Spark checkpoint interval */ - dStream.checkpoint(checkpointInterval); - - /** Union with the existing streams */ - allStreams = allStreams.union(dStream); - } - - /** This implementation uses the String-based KinesisRecordSerializer impl */ - final KinesisRecordSerializer recordSerializer = new KinesisStringRecordSerializer(); - - /** - * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection. - * Convert lines of byte[] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR - * We're caching the result here so that we can use it later without having to re-materialize the underlying RDDs. - */ - JavaDStream words = allStreams.flatMap(new FlatMapFunction() { - /** - * Convert lines of byte[] to multiple words split by WORD_SEPARATOR - * @param byte array - * @return iterable of words split by WORD_SEPARATOR - */ - @Override - public Iterable call(byte[] line) { - return Lists.newArrayList(WORD_SEPARATOR.split(recordSerializer.deserialize(line))); - } - }).cache(); - - /** - * Map each word to a (word, 1) tuple so we can reduce/aggregate later. - * We're caching the result here so that we can use it later without having - * to re-materialize the underlying RDDs. - */ - JavaPairDStream wordCounts = words.mapToPair( - new PairFunction() { - /** - * Create the (word, 1) tuple - * @param word - * @return (word, 1) tuple - */ - @Override - public Tuple2 call(String s) { - return new Tuple2(s, 1); - } - }); - - /** - * Reduce/aggregate by key - * We're caching the result here so that we can use it later without having - * to re-materialize the underlying RDDs. - */ - JavaPairDStream wordCountsByKey = wordCounts.reduceByKey( - new Function2() { - @Override - public Integer call(Integer i1, Integer i2) { - return i1 + i2; - } - }).cache(); - - /** Update the running totals of words. 
*/ - Function2, Optional, Optional> updateTotals = - /** - * @param sequence of new counts - * @param current running total (could be None if no current count exists) - * @return updated count - */ - new Function2, Optional, Optional>() { - @Override public Optional call(List newCounts, Optional currentCount) { - Integer currentSum = 0; - if (currentCount.isPresent()) { - currentSum = currentCount.get(); - } - Integer newSum = currentSum; - - for (Integer newCount : newCounts) { - newSum += newCount; - } - return Optional.of(newSum); - } - }; - - /** - * Calculate the running totals using the updateTotals method. - */ - JavaPairDStream wordTotalsByKey = wordCountsByKey.updateStateByKey(updateTotals); - - /** - * Sort and print the running word totals. - * This is an Output Operation and will materialize the DStream. - */ - sortAndPrint("Word Count Totals By Key", wordTotalsByKey); - - /** Start the streaming context and await termination */ - jssc.start(); - jssc.awaitTermination(); - } - - /** - * Sort and print the given dstream. - * This is an Output Operation that will materialize the underlying DStream. - * Everything up to this point is a lazy Transformation Operation. - * - * @param description of the dstream for logging purposes - * @param dstream to sort and print - */ - private static void sortAndPrint(final String description, JavaPairDStream dstream) { - dstream.foreachRDD( - new Function, Void>() { - public Void call(JavaPairRDD batch) { - JavaPairRDD sortedBatch = batch.sortByKey(true); - logger.info(description); - for (Object wordCount: sortedBatch.collect()) { - logger.info(wordCount); - } - - return null; - } - }); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java new file mode 100644 index 0000000000000..31793aaa020ba --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.examples.streaming; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.apache.spark.streaming.kinesis.KinesisUtils; + +import scala.Tuple2; + +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.services.kinesis.AmazonKinesisClient; +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; +import com.google.common.collect.Lists; + +/** + * Java-friendly Kinesis Spark Streaming WordCount example + * + * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details + * on the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard + * for the given stream. + * It then starts pulling from the last checkpointed sequence number of the given + * and . + * + * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials + * in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: JavaKinesisWordCountASL + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.JavaKinesisWordCountASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com + * + * There is a companion helper class called KinesisWordCountProducerASL which puts dummy data + * onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducerASL are provided in the class definition. + */ +public final class JavaKinesisWordCountASL { + private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); + private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class); + + /** + * Make the constructor private to enforce singleton + */ + private JavaKinesisWordCountASL() { + } + + public static void main(String[] args) { + /** + * Check that all required args were passed in. + */ + if (args.length < 2) { + System.err.println( + "|Usage: KinesisWordCount \n" + + "| is the name of the Kinesis stream\n" + + "| is the endpoint of the Kinesis service\n" + + "| (e.g. 
https://kinesis.us-east-1.amazonaws.com)\n"); + System.exit(1); + } + + StreamingExamples.setStreamingLogLevels(); + + /** Populate the appropriate variables from the given args */ + String streamName = args[0]; + String endpointUrl = args[1]; + /** Set the batch interval to a fixed 2000 millis (2 seconds) */ + Duration batchInterval = new Duration(2000); + + /** Create a Kinesis client in order to determine the number of shards for the given stream */ + AmazonKinesisClient kinesisClient = new AmazonKinesisClient( + new DefaultAWSCredentialsProviderChain()); + kinesisClient.setEndpoint(endpointUrl); + + /** Determine the number of shards from the stream */ + int numShards = kinesisClient.describeStream(streamName) + .getStreamDescription().getShards().size(); + + /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard */ + int numStreams = numShards; + + /** Must add 1 more thread than the number of receivers or the output won't show properly from the driver */ + int numSparkThreads = numStreams + 1; + + /** Setup the Spark config. */ + SparkConf sparkConfig = new SparkConf().setAppName("KinesisWordCount").setMaster( + "local[" + numSparkThreads + "]"); + + /** Kinesis checkpoint interval. Same as batchInterval for this example. */ + Duration checkpointInterval = batchInterval; + + /** Setup the StreamingContext */ + JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval); + + /** Setup the checkpoint directory used by Spark Streaming */ + jssc.checkpoint("/tmp/checkpoint"); + + /** Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ + List> streamsList = new ArrayList>(numStreams); + for (int i = 0; i < streamsList.size(); i++) { + streamsList.add( + KinesisUtils.createStream(jssc, streamName, endpointUrl, checkpointInterval, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()) + ); + } + + /** Union all the streams if there is more than 1 stream */ + JavaDStream unionStreams; + if (streamsList.size() > 1) { + unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size())); + } else { + /** Otherwise, just use the 1 stream */ + unionStreams = streamsList.get(0); + } + + /** + * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection. + * Convert lines of byte[] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR. + */ + JavaDStream words = unionStreams.flatMap(new FlatMapFunction() { + @Override + public Iterable call(byte[] line) { + return Lists.newArrayList(WORD_SEPARATOR.split(new String(line))); + } + }); + + /** Map each word to a (word, 1) tuple, then reduce/aggregate by key. 
*/ + JavaPairDStream wordCounts = words.mapToPair( + new PairFunction() { + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }).reduceByKey(new Function2() { + @Override + public Integer call(Integer i1, Integer i2) { + return i1 + i2; + } + }); + + /** Print the first 10 wordCounts by key */ + wordCounts.print(); + + /** Start the streaming context and await termination */ + jssc.start(); + jssc.awaitTermination(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala deleted file mode 100644 index 50c3889d277fa..0000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCount.scala +++ /dev/null @@ -1,369 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming - -import java.nio.ByteBuffer - -import scala.util.Random - -import org.apache.log4j.Level -import org.apache.log4j.Logger -import org.apache.spark.Logging -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext.rddToOrderedRDDFunctions -import org.apache.spark.annotation.Experimental -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.Milliseconds -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions -import org.apache.spark.streaming.dstream.DStream -import org.apache.spark.streaming.kinesis.KinesisStringRecordSerializer -import org.apache.spark.streaming.kinesis.KinesisUtils - -import com.amazonaws.auth.DefaultAWSCredentialsProviderChain -import com.amazonaws.services.kinesis.AmazonKinesisClient -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream -import com.amazonaws.services.kinesis.model.PutRecordRequest - -/** - * Kinesis Spark Streaming WordCount example. - * - * See http://spark.apache.org/docs/latest/streaming-programming-guide.html for more details on - * the Kinesis Spark Streaming integration. - * - * This example spins up 1 Kinesis Worker (Spark Streaming Receivers) per shard of the - * given stream. - * It then starts pulling from the last checkpointed sequence number of the given - * and . 
- * - * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region - * - * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials - * in the following order of precedence: - * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY - * Java System Properties - aws.accessKeyId and aws.secretKey - * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs - * Instance profile credentials - delivered through the Amazon EC2 metadata service - * - * Usage: KinesisWordCount - * is the name of the Kinesis stream (ie. mySparkStream) - * is the endpoint of the Kinesis service - * (ie. https://kinesis.us-east-1.amazonaws.com) - * - * Example: - * $ export AWS_ACCESS_KEY_ID= - * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/bin/run-example \ - * org.apache.spark.examples.streaming.KinesisWordCount mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com - * - * There is a companion helper class below called KinesisWordCountProducer which puts - * dummy data onto the Kinesis stream. - * Usage instructions for KinesisWordCountProducer are provided in that class definition. - */ -object KinesisWordCount extends Logging { - val WordSeparator = " " - - def main(args: Array[String]) { -/** - * Check that all required args were passed in. - */ - if (args.length < 2) { - System.err.println("Usage: KinesisWordCount ") - System.exit(1) - } - - /** - * (This was lifted from the StreamingExamples.scala in order to avoid the dependency - * on the spark-examples artifact.) - * Set reasonable logging levels for streaming if the user has not configured log4j. - */ - val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements() - if (!log4jInitialized) { - /** - * We first log something to initialize Spark's default logging, - * then we override the logging level. - * */ - logInfo("Setting log level to [INFO] for streaming example." + - " To override add a custom log4j.properties to the classpath.") - - Logger.getRootLogger().setLevel(Level.INFO) - Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); - } - - /** Populate the appropriate variables from the given args */ - val Array(stream, endpoint) = args - val batchIntervalMillis = 2000 - - /** Create a Kinesis client in order to determine the number of shards for the given stream */ - val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) - kinesisClient.setEndpoint(endpoint) - - /** Determine the number of shards from the stream */ - val numShards = kinesisClient.describeStream(stream).getStreamDescription().getShards().size() - - /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard.*/ - val numStreams = numShards - - /** - * Must add 1 more thread than the number of receivers or the output won't show properly - * from the driver - */ - val numSparkThreads = numStreams + 1 - - /** Set the app name */ - val appName = "KinesisWordCount" - - /** Setup the Spark config. */ - val sparkConfig = new SparkConf().setAppName(appName).setMaster(s"local[$numSparkThreads]") - - /** - * Set the batch interval. - * Records will be pulled from the Kinesis stream and stored as a single DStream within Spark - * every batch interval. - */ - val batchInterval = Milliseconds(batchIntervalMillis) - - /** - * It's recommended that you perform a Spark checkpoint between 5 and 10 times the batch - * interval. 
- * While this is the Spark checkpoint interval, we're going to use it for the Kinesis - * checkpoint interval, as well. - * For example purposes, we'll just use the batchInterval. - */ - val checkpointInterval = batchInterval - - /** Setup the StreamingContext */ - val ssc = new StreamingContext(sparkConfig, batchInterval) - - /** Setup the checkpoint directory used by Spark Streaming */ - ssc.checkpoint("/tmp/checkpoint"); - - /** - * Create the same number of Kinesis Receivers/DStreams as stream shards, then union - * them all. - */ - var allStreams: DStream[Array[Byte]] = KinesisUtils.createStream(ssc, appName, stream, - endpoint, checkpointInterval.milliseconds, InitialPositionInStream.LATEST) - /** Set the checkpoint interval */ - allStreams.checkpoint(checkpointInterval) - for (i <- 1 until numStreams) { - /** Create a new Receiver/DStream for each stream shard */ - val dStream = KinesisUtils.createStream(ssc, appName, stream, endpoint, - checkpointInterval.milliseconds, InitialPositionInStream.LATEST) - /** Set the Spark checkpoint interval */ - dStream.checkpoint(checkpointInterval) - - /** Union with the existing streams */ - allStreams = allStreams.union(dStream) - } - - /** This implementation uses the String-based KinesisRecordSerializer impl */ - val recordSerializer = new KinesisStringRecordSerializer() - - /** - * Sort and print the given dstream. - * This is an Output Operation that will materialize the underlying DStream. - * Everything up to this point is a lazy Transformation Operation. - * - * @param description of the dstream for logging purposes - * @param dstream to sort and print - */ - def sortAndPrint(description: String, dstream: DStream[(String,Int)]) = { - dstream.foreachRDD((batch, endOfWindowTime) => { - val sortedBatch = batch.sortByKey(true) - logInfo(s"$description @ $endOfWindowTime") - sortedBatch.collect().foreach( - wordCount => logInfo(s"$wordCount")) - } - ) - } - - /** - * Split each line of the union'd DStreams into multiple words using flatMap - * to produce the collection. - * Convert lines of Array[Byte] to multiple Strings by first converting to String, - * then splitting on WORD_SEPARATOR - * We're caching the result here so that we can use it later without having - * to re-materialize the underlying RDDs. - */ - val words = allStreams.flatMap(line => recordSerializer.deserialize(line) - .split(WordSeparator)).cache() - - /** - * Map each word to a (word, 1) tuple so we can reduce/aggregate later. - * We're caching the result here so that we can use it later without having - * to re-materialize the underlying RDDs. - */ - val wordCounts = words.map(word => (word, 1)) - - /** - * Reduce/aggregate by key. - * We're caching the result here so that we can use it later without having - * to re-materialize the underlying RDDs. - */ - val wordCountsByKey = wordCounts.reduceByKey((left, right) => left + right) - - /** - * Update the running totals of words. - * - * @param sequence of new counts - * @param current running total (could be None if no current count exists) - */ - def updateTotals = (newCounts: Seq[Int], currentCounts: Option[Int]) => { - val newCount = newCounts.foldLeft(0)((left, right) => left + right) - val currentCount = currentCounts.getOrElse(0) - Some(newCount + currentCount) - } - - /** - * Calculate the running totals using the updateTotals method. - */ - val wordTotalsByKey = wordCountsByKey.updateStateByKey[Int](updateTotals) - - /** - * Sort and print the running word totals. 
- * This is an Output Operation and will materialize the DStream. - */ - sortAndPrint("Word Count Totals By Key", wordTotalsByKey) - - /** Start the streaming context and await termination */ - ssc.start() - ssc.awaitTermination() - } -} - -/** - * Usage: KinesisWordCountProducer - * - * is the name of the Kinesis stream (ie. mySparkStream) - * is the endpoint of the Kinesis service - * (ie. https://kinesis.us-east-1.amazonaws.com) - * is the rate of records per second to put onto the stream - * is the rate of records per second to put onto the stream - * - * Example: - * $ export AWS_ACCESS_KEY_ID= - * $ export AWS_SECRET_KEY= - * $ $SPARK_HOME/bin/run-example \ - * org.apache.spark.examples.streaming.KinesisWordCountProducer mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com 10 5 - */ -private[streaming] -object KinesisWordCountProducer extends Logging { - val MaxRandomInts = 10 - - def main(args: Array[String]) { - if (args.length < 4) { - System.err.println("Usage: KinesisWordCountProducer " + - " ") - System.exit(1) - } - - /** - * (This was lifted from the StreamingExamples.scala in order to avoid the dependency - * on the spark-examples artifact.) - * Set reasonable logging levels for streaming if the user has not configured log4j. - */ - val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements - if (!log4jInitialized) { - /** - * We first log something to initialize Spark's default logging, then we override - * the logging level. - */ - logInfo("Setting log level to [INFO] for streaming example." + - " To override add a custom log4j.properties to the classpath.") - - Logger.getRootLogger().setLevel(Level.INFO) - Logger.getLogger("org.apache.spark.examples.streaming").setLevel(Level.DEBUG); - } - - /** Populate the appropriate variables from the given args */ - val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args - - /** Generate the records and return the totals */ - val totals: Seq[(Int, Int)] = generate(stream, endpoint, recordsPerSecond.toInt, - wordsPerRecord.toInt) - - logInfo("Totals") - /** Print the array of (index, total) tuples */ - totals.foreach(total => logInfo(total.toString())) - } - - def generate(stream: String, - endpoint: String, - recordsPerSecond: Int, - wordsPerRecord: Int): Seq[(Int, Int)] = { - val WORD_SEPARATOR = " " - - /** Create the Kinesis client */ - val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) - kinesisClient.setEndpoint(endpoint) - - logInfo(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" + - s" $recordsPerSecond records per second and $wordsPerRecord words per record"); - - /** Create the String-based record serializer */ - val recordSerializer = new KinesisStringRecordSerializer() - - val totals = new Array[Int](MaxRandomInts) - /** Put String records onto the stream per the given recordPerSec and wordsPerRecord */ - for (i <- 1 to 5) { - /** Generate recordsPerSec records to put onto the stream */ - val records = (1 to recordsPerSecond.toInt).map { recordNum => - /** - * Randomly generate each wordsPerRec words between 0 (inclusive) - * and MAX_RANDOM_INTS (exclusive) - */ - val data = (1 to wordsPerRecord.toInt).map(x => { - /** Generate the random int */ - val randomInt = Random.nextInt(MaxRandomInts) - - /** Keep track of the totals */ - totals(randomInt) += 1 - - /** Convert the Int to a String */ - randomInt.toString() - }) - /** Create a String of randomInts separated by WORD_SEPARATOR */ - .mkString(WORD_SEPARATOR) - - /** Create 
a partitionKey based on recordNum */ - val partitionKey = s"partitionKey-$recordNum" - - /** Create a PutRecordRequest with an Array[Byte] version of the data */ - val putRecordRequest = new PutRecordRequest().withStreamName(stream) - .withPartitionKey(partitionKey) - .withData(ByteBuffer.wrap(recordSerializer.serialize(data))); - - /** Put the record onto the stream and capture the PutRecordResult */ - val putRecordResult = kinesisClient.putRecord(putRecordRequest); - - logInfo(s"Successfully put record with partitionKey $partitionKey and shardId" + - s" ${putRecordResult.getShardId()} and data $data and endpoint $endpoint and stream" + - s" $stream") - } - - /** Sleep for a second */ - Thread.sleep(1000) - } - - /** Convert the totals to (index, total) tuple */ - (0 to (MaxRandomInts - 1)).zip(totals) - } -} diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala new file mode 100644 index 0000000000000..865eea433aeb9 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.streaming + +import java.nio.ByteBuffer + +import scala.util.Random + +import org.apache.spark.Logging +import org.apache.spark.SparkConf +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Milliseconds +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions +import org.apache.spark.streaming.kinesis.KinesisUtils + +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.model.PutRecordRequest + +/** + * Kinesis Spark Streaming WordCount example. + * + * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details on + * the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard + * for the given stream. + * It then starts pulling from the last checkpointed sequence number of the given + * and . 
+ * + * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials + * in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: KinesisWordCountASL + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.KinesisWordCountASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com + * + * There is a companion helper class below called KinesisWordCountProducerASL which puts + * dummy data onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducerASL are provided in that class definition. + */ +object KinesisWordCountASL extends Logging { + def main(args: Array[String]) { +/** + * Check that all required args were passed in. + */ + if (args.length < 2) { + System.err.println( + """ + |Usage: KinesisWordCount + | is the name of the Kinesis stream + | is the endpoint of the Kinesis service + | (e.g. https://kinesis.us-east-1.amazonaws.com) + """.stripMargin) + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + /** Populate the appropriate variables from the given args */ + val Array(streamName, endpointUrl) = args + + /** Determine the number of shards from the stream */ + val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) + kinesisClient.setEndpoint(endpointUrl) + val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size() + + /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. */ + val numStreams = numShards + + /** + * numSparkThreads should be 1 more thread than the number of receivers. + * This leaves one thread available for actually processing the data. + */ + val numSparkThreads = numStreams + 1 + + /** Setup the and SparkConfig and StreamingContext */ + /** Spark Streaming batch interval */ + val batchInterval = Milliseconds(2000) + val sparkConfig = new SparkConf().setAppName("KinesisWordCount").setMaster(s"local[$numSparkThreads]") + val ssc = new StreamingContext(sparkConfig, batchInterval) + /** Setup the checkpoint directory used by Spark Streaming */ + ssc.checkpoint("/tmp/checkpoint"); + + /** Kinesis checkpoint interval. Same as batchInterval for this example. */ + val kinesisCheckpointInterval = batchInterval + + /** Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ + val kinesisStreams = (0 until numStreams).map { i => + KinesisUtils.createStream(ssc, streamName, endpointUrl, kinesisCheckpointInterval, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) + } + + /** Union all the streams */ + val unionStreams = ssc.union(kinesisStreams) + + /** Convert each line of Array[Byte] to String, split into words, and count them */ + val words = unionStreams.flatMap(byteArray => new String(byteArray) + .split(" ")) + + /** Map each word to a (word, 1) tuple so we can reduce/aggregate by key. 
*/ + val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _) + + /** Print the first 10 wordCounts by key */ + wordCounts.print() + + /** Start the streaming context and await termination */ + ssc.start() + ssc.awaitTermination() + } +} + +/** + * Usage: KinesisWordCountProducerASL + * + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * is the rate of records per second to put onto the stream + * is the rate of records per second to put onto the stream + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.KinesisWordCountProducerASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com 10 5 + */ +object KinesisWordCountProducerASL { + def main(args: Array[String]) { + if (args.length < 4) { + System.err.println("Usage: KinesisWordCountProducerASL " + + " ") + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + /** Populate the appropriate variables from the given args */ + val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args + + /** Generate the records and return the totals */ + val totals = generate(stream, endpoint, recordsPerSecond.toInt, wordsPerRecord.toInt) + + /** Print the array of (index, total) tuples */ + println("Totals") + totals.foreach(total => println(total.toString())) + } + + def generate(stream: String, + endpoint: String, + recordsPerSecond: Int, + wordsPerRecord: Int): Seq[(Int, Int)] = { + + val MaxRandomInts = 10 + + /** Create the Kinesis client */ + val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) + kinesisClient.setEndpoint(endpoint) + + println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" + + s" $recordsPerSecond records per second and $wordsPerRecord words per record"); + + val totals = new Array[Int](MaxRandomInts) + /** Put String records onto the stream per the given recordPerSec and wordsPerRecord */ + for (i <- 1 to 5) { + + /** Generate recordsPerSec records to put onto the stream */ + val records = (1 to recordsPerSecond.toInt).map { recordNum => + /** + * Randomly generate each wordsPerRec words between 0 (inclusive) + * and MAX_RANDOM_INTS (exclusive) + */ + val data = (1 to wordsPerRecord.toInt).map(x => { + /** Generate the random int */ + val randomInt = Random.nextInt(MaxRandomInts) + + /** Keep track of the totals */ + totals(randomInt) += 1 + + randomInt.toString() + }).mkString(" ") + + /** Create a partitionKey based on recordNum */ + val partitionKey = s"partitionKey-$recordNum" + + /** Create a PutRecordRequest with an Array[Byte] version of the data */ + val putRecordRequest = new PutRecordRequest().withStreamName(stream) + .withPartitionKey(partitionKey) + .withData(ByteBuffer.wrap(data.getBytes())); + + /** Put the record onto the stream and capture the PutRecordResult */ + val putRecordResult = kinesisClient.putRecord(putRecordRequest); + } + + /** Sleep for a second */ + Thread.sleep(1000) + println("Sent " + recordsPerSecond + " records") + } + + /** Convert the totals to (index, total) tuple */ + (0 to (MaxRandomInts - 1)).zip(totals) + } +} diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 0afb076d1f0eb..739a010200dee 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -27,10 +27,9 @@ org.apache.spark - kinesis-asl_2.10 + spark-streaming-kinesis-asl_2.10 jar Spark Kinesis 
Integration diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala similarity index 82% rename from extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala rename to extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala index febde542723b2..a541a72614cbf 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/CheckpointState.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala @@ -17,6 +17,7 @@ package org.apache.spark.streaming.kinesis import org.apache.spark.Logging +import org.apache.spark.streaming.Duration import org.apache.spark.streaming.util.Clock import org.apache.spark.streaming.util.ManualClock import org.apache.spark.streaming.util.SystemClock @@ -24,18 +25,18 @@ import org.apache.spark.streaming.util.SystemClock /** * This is a helper class for managing checkpoint clocks. * - * @param checkpoint interval in millis - * @param current clock. if none specified, will default to current SystemClock + * @param checkpointInterval + * @param currentClock. Default to current SystemClock if none is passed in (mocking purposes) */ -private[kinesis] class CheckpointState( - checkpointIntervalMillis: Long, +private[kinesis] class KinesisCheckpointState( + checkpointInterval: Duration, currentClock: Clock = new SystemClock()) extends Logging { /** - * Initialize the checkpoint clock using the given currentClock + checkpointIntervalMillis + * Initialize the checkpoint clock using the given currentClock + checkpointInterval millis */ val checkpointClock = new ManualClock() - checkpointClock.setTime(currentClock.currentTime() + checkpointIntervalMillis) + checkpointClock.setTime(currentClock.currentTime() + checkpointInterval.milliseconds) /** * Check if it's time to checkpoint based on the current time and the derived time @@ -51,6 +52,6 @@ private[kinesis] class CheckpointState( * Advance the checkpoint clock by the checkpoint interval. */ def advanceCheckpoint() = { - checkpointClock.addToTime(checkpointIntervalMillis) + checkpointClock.addToTime(checkpointInterval.milliseconds) } } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index d6e4b7996877c..7d3897d45c77f 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -21,6 +21,7 @@ import java.util.UUID import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Duration import org.apache.spark.streaming.receiver.Receiver import com.amazonaws.auth.AWSCredentialsProvider @@ -41,23 +42,33 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker * Instances of this class will get shipped to the Spark Streaming Workers * to run within a Spark Executor. * - * @param appName unique name for your Kinesis app. Multiple instances of the app pull from - * the same stream. The Kinesis Client Library coordinates all load-balancing and - * failure-recovery. - * @param stream Kinesis stream name - * @param endpoint url of Kinesis service (ie. 
https://kinesis.us-east-1.amazonaws.com) - * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region - * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing - * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. + * @param appName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams + * by the Kinesis Client Library. If you change the App name or Stream name, + * the KCL will throw errors. This usually requires deleting the backing + * DynamoDB table with the same name this Kinesis application. + * @param streamName Kinesis stream name + * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. + * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * (InitialPositionInStream.TRIM_HORIZON) or + * the tip of the stream (InitialPositionInStream.LATEST). + * @param storageLevel Storage level to use for storing the received objects + * + * @return ReceiverInputDStream[Array[Byte]] */ private[kinesis] class KinesisReceiver( appName: String, - stream: String, - endpoint: String, - checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream) - extends Receiver[Array[Byte]](StorageLevel.MEMORY_AND_DISK_2) with Logging { receiver => + streamName: String, + endpointUrl: String, + checkpointInterval: Duration, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel) + extends Receiver[Array[Byte]](storageLevel) with Logging { receiver => /** * The following vars are built in the onStart() method which executes in the Spark Worker after @@ -109,12 +120,12 @@ private[kinesis] class KinesisReceiver( override def onStart() { workerId = InetAddress.getLocalHost.getHostAddress() + ":" + UUID.randomUUID() credentialsProvider = new DefaultAWSCredentialsProviderChain() - kinesisClientLibConfiguration = new KinesisClientLibConfiguration(appName, stream, - credentialsProvider, workerId).withKinesisEndpoint(endpoint) + kinesisClientLibConfiguration = new KinesisClientLibConfiguration(appName, streamName, + credentialsProvider, workerId).withKinesisEndpoint(endpointUrl) .withInitialPositionInStream(initialPositionInStream).withTaskBackoffTimeMillis(500) recordProcessorFactory = new IRecordProcessorFactory { override def createProcessor: IRecordProcessor = new KinesisRecordProcessor(receiver, - workerId, new CheckpointState(checkpointIntervalMillis)) + workerId, new KinesisCheckpointState(checkpointInterval)) } worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration) worker.run() diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index 055e7297706ae..5d201819a8f87 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -19,10 +19,14 @@ package org.apache.spark.streaming.kinesis 
import java.util.List import scala.collection.JavaConversions.asScalaBuffer -import scala.collection.mutable.ArrayBuffer +import scala.util.Random import org.apache.spark.Logging +import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason @@ -33,15 +37,15 @@ import com.amazonaws.services.kinesis.model.Record * This implementation operates on the Array[Byte] from the KinesisReceiver. * The Kinesis Worker creates an instance of this KinesisRecordProcessor upon startup. * - * @param Kinesis receiver + * @param receiver Kinesis receiver * @param workerId for logging purposes - * @param checkpointState represents the checkpoint state including the next time a - * checkpoint is needed. it's injected here for mocking purposes. + * @param checkpointState represents the checkpoint state including the next checkpoint time. + * It's injected here for mocking purposes. */ private[kinesis] class KinesisRecordProcessor( receiver: KinesisReceiver, workerId: String, - checkpointState: CheckpointState) extends IRecordProcessor with Logging { + checkpointState: KinesisCheckpointState) extends IRecordProcessor with Logging { /** shardId to be populated during initialize() */ var shardId: String = _ @@ -61,7 +65,7 @@ private[kinesis] class KinesisRecordProcessor( * This is the record-processing bridge between the KCL's IRecordProcessor.processRecords() * and Spark Streaming's Receiver.store(). * - * @param list of records from the Kinesis stream shard + * @param batch list of records from the Kinesis stream shard * @param checkpointer used to update Kinesis when this batch has been processed/stored * in the DStream */ @@ -69,18 +73,16 @@ private[kinesis] class KinesisRecordProcessor( if (!receiver.isStopped()) { try { /** - * Convert the list of records to a list of Array[Byte] * Note: If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming * Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the * internally-configured Spark serializer (kryo, etc). - * This is not desirable, so we instead store a raw Array[Byte] and decouple - * ourselves from the internal serialization strategy. + * This is not desirable, so we instead store a raw Array[Byte] and decouple + * ourselves from Spark's internal serialization strategy. 
*/ - val batchByteArrays = new ArrayBuffer[Array[Byte]](batch.size()) - batchByteArrays ++= batch.map(record => record.getData().array()) - - /** Store the list of Array[Byte] in Spark */ - KinesisRecordProcessorUtils.retry(receiver.store(batchByteArrays), 4, 500) + batch.foreach(record => + KinesisRecordProcessor.retry(receiver.store(record.getData().array()), 4, 500) + ) + logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") /** @@ -96,7 +98,7 @@ private[kinesis] class KinesisRecordProcessor( */ if (checkpointState.shouldCheckpoint()) { /** Perform the checkpoint */ - KinesisRecordProcessorUtils.retry(checkpointer.checkpoint(), 4, 500) + KinesisRecordProcessor.retry(checkpointer.checkpoint(), 4, 500) /** Update the next checkpoint time */ checkpointState.advanceCheckpoint() @@ -134,8 +136,8 @@ private[kinesis] class KinesisRecordProcessor( * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason * (ShutdownReason.ZOMBIE) * - * @param checkpointer used to performn a Kinesis checkpoint for ShutdownReason.TERMINATE - * @param shutdown reason (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE) + * @param checkpointer used to perform a Kinesis checkpoint for ShutdownReason.TERMINATE + * @param reason for shutdown (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE) */ override def shutdown(checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason) { logInfo(s"Shutdown: Shutting down workerId $workerId with reason $reason") @@ -145,7 +147,7 @@ private[kinesis] class KinesisRecordProcessor( * Checkpoint to indicate that all records from the shard have been drained and processed. * It's now OK to read from the new shards that resulted from a resharding event. */ - case ShutdownReason.TERMINATE => KinesisRecordProcessorUtils.retry(checkpointer.checkpoint(), + case ShutdownReason.TERMINATE => KinesisRecordProcessor.retry(checkpointer.checkpoint(), 4, 500) /** @@ -161,3 +163,52 @@ private[kinesis] class KinesisRecordProcessor( } } } + +private[kinesis] object KinesisRecordProcessor extends Logging { + /** + * Retry the given amount of times with a random backoff time (millis) less than the + * given maxBackOffMillis + * + * @param expression expression to evalute + * @param numRetriesLeft number of retries left + * @param maxBackOffMillis: max millis between retries + * + * @return evaluation of the given expression + * @throws Unretryable exception, unexpected exception, + * or any exception that persists after numRetriesLeft reaches 0 + */ + @annotation.tailrec + def retry[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { + util.Try { expression } match { + /** If the function succeeded, evaluate to x. 
*/ + case util.Success(x) => x + /** If the function failed, either retry or throw the exception */ + case util.Failure(e) => e match { + /** Retry: Throttling or other Retryable exception has occurred */ + case _: ThrottlingException | _: KinesisClientLibDependencyException if numRetriesLeft > 1 + => { + val backOffMillis = Random.nextInt(maxBackOffMillis) + Thread.sleep(backOffMillis) + logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e) + retry(expression, numRetriesLeft - 1, maxBackOffMillis) + } + /** Throw: Shutdown has been requested by the Kinesis Client Library.*/ + case _: ShutdownException => { + logError(s"ShutdownException: Caught shutdown exception, skipping checkpoint.", e) + throw e + } + /** Throw: Non-retryable exception has occurred with the Kinesis Client Library */ + case _: InvalidStateException => { + logError(s"InvalidStateException: Cannot save checkpoint to the DynamoDB table used" + + s" by the Amazon Kinesis Client Library. Table likely doesn't exist.", e) + throw e + } + /** Throw: Unexpected exception has occurred */ + case _ => { + logError(s"Unexpected, non-retryable exception.", e) + throw e + } + } + } + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessorUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessorUtils.scala deleted file mode 100644 index 63d839f3a3bb3..0000000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessorUtils.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import scala.util.Random - -import org.apache.spark.Logging - -import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException - - -/** - * Helper for the KinesisRecordProcessor. 
- */ -private[kinesis] object KinesisRecordProcessorUtils extends Logging { - /** - * Retry the given amount of times with a random backoff time (millis) less than the - * given maxBackOffMillis - * - * @param expression expression to evalute - * @param numRetriesLeft number of retries left - * @param maxBackOffMillis: max millis between retries - * - * @return Evaluation of the given expression - * @throws Unretryable exception, unexpected exception, - * or any exception that persists after numRetriesLeft reaches 0 - */ - @annotation.tailrec - def retry[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { - util.Try { expression } match { - /** If the function succeeded, evaluate to x. */ - case util.Success(x) => x - /** If the function failed, either retry or throw the exception */ - case util.Failure(e) => e match { - /** Retry: Throttling or other Retryable exception has occurred */ - case _: ThrottlingException | _: KinesisClientLibDependencyException if numRetriesLeft > 1 - => { - val backOffMillis = Random.nextInt(maxBackOffMillis) - Thread.sleep(backOffMillis) - logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e) - retry(expression, numRetriesLeft - 1, maxBackOffMillis) - } - /** Throw: Shutdown has been requested by the Kinesis Client Library.*/ - case _: ShutdownException => { - logError(s"ShutdownException: Caught shutdown exception, skipping checkpoint.", e) - throw e - } - /** Throw: Non-retryable exception has occurred with the Kinesis Client Library */ - case _: InvalidStateException => { - logError(s"InvalidStateException: Cannot save checkpoint to the DynamoDB table used" + - s" by the Amazon Kinesis Client Library. Table likely doesn't exist.", e) - throw e - } - /** Throw: Unexpected exception has occurred */ - case _ => { - logError(s"Unexpected, non-retryable exception.", e) - throw e - } - } - } - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala deleted file mode 100644 index b63f19a8fead8..0000000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordSerializer.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -/** - * Convert custom types to/from Array[Byte]. 
- * @tparam type to serialize/deserialize - */ -private[streaming] trait KinesisRecordSerializer[T] extends Serializable { - /** - * Convert type to Array[Byte] - * - * @param type to serialize - * @return byte array - */ - def serialize(t: T): Array[Byte] - - /** - * Convert Array[Byte] to type - * - * @param byte array - * @return deserialized type - */ - def deserialize(array: Array[Byte]): T -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala deleted file mode 100644 index 4833ccd63d380..0000000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisStringRecordSerializer.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import org.apache.spark.Logging - -/** - * Implementation of KinesisRecordSerializer to convert Array[Byte] to/from String. - */ -class KinesisStringRecordSerializer extends KinesisRecordSerializer[String] with Logging { - /** - * Convert String to Array[Byte] - * - * @param string to serialize - * @return byte array - */ - def serialize(string: String): Array[Byte] = { - string.getBytes() - } - - /** - * Convert Array[Byte] to String - * - * @param byte array - * @return deserialized string - */ - def deserialize(array: Array[Byte]): String = { - new String(array) - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index 2b6b833457e35..d3560f6a690fc 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -19,6 +19,7 @@ package org.apache.spark.streaming.kinesis import org.apache.spark.Logging import org.apache.spark.annotation.Experimental import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Duration import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaReceiverInputDStream import org.apache.spark.streaming.api.java.JavaStreamingContext @@ -36,52 +37,63 @@ object KinesisUtils extends Logging { /** * Create an InputDStream that pulls messages from a Kinesis stream. * - * @param ssc StreamingContext - * @param appName unique name for your Kinesis app. Multiple instances of the app pull from - * the same stream. The Kinesis Client Library coordinates all load-balancing and - * failure-recovery. - * @param stream Kinesis stream name - * @param endpoint url of Kinesis service (ie. 
https://kinesis.us-east-1.amazonaws.com) - * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region - * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing - * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. + * @param ssc StreamingContext object + * @param streamName Kinesis stream name + * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. + * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * (InitialPositionInStream.TRIM_HORIZON) or + * the tip of the stream (InitialPositionInStream.LATEST). + * @param storageLevel Storage level to use for storing the received objects + * * @return ReceiverInputDStream[Array[Byte]] */ def createStream( ssc: StreamingContext, - appName: String, - stream: String, - endpoint: String, - checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream): ReceiverInputDStream[Array[Byte]] = { - ssc.receiverStream(new KinesisReceiver(appName, stream, endpoint, checkpointIntervalMillis, - initialPositionInStream)) + streamName: String, + endpointUrl: String, + checkpointInterval: Duration, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel): ReceiverInputDStream[Array[Byte]] = { + ssc.receiverStream(new KinesisReceiver(ssc.sc.appName, streamName, endpointUrl, + checkpointInterval, initialPositionInStream, storageLevel)) } /** * Create a Java-friendly InputDStream that pulls messages from a Kinesis stream. * * @param jssc Java StreamingContext object - * @param appName unique name for your Kinesis app. Multiple instances of the app pull from - * the same stream. The Kinesis Client Library coordinates all load-balancing and - * failure-recovery. - * @param stream Kinesis stream name - * @param endpoint url of Kinesis service (ie. https://kinesis.us-east-1.amazonaws.com) - * Available endpoints: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region - * @param checkpointIntervalMillis interval (millis) for Kinesis checkpointing - * @param initialPositionInStream in the absence of a Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. + * @param ssc StreamingContext object + * @param streamName Kinesis stream name + * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. + * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * (InitialPositionInStream.TRIM_HORIZON) or + * the tip of the stream (InitialPositionInStream.LATEST). 
+ * @param storageLevel Storage level to use for storing the received objects + * + * @return JavaReceiverInputDStream[Array[Byte]] + * * @return JavaReceiverInputDStream[Array[Byte]] */ def createStream( jssc: JavaStreamingContext, - appName: String, - stream: String, - endpoint: String, - checkpointIntervalMillis: Long, - initialPositionInStream: InitialPositionInStream): JavaReceiverInputDStream[Array[Byte]] = { - jssc.receiverStream(new KinesisReceiver(appName, stream, endpoint, checkpointIntervalMillis, - initialPositionInStream)) + streamName: String, + endpointUrl: String, + checkpointInterval: Duration, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel): JavaReceiverInputDStream[Array[Byte]] = { + jssc.receiverStream(new KinesisReceiver(jssc.ssc.sc.appName, streamName, + endpointUrl, checkpointInterval, initialPositionInStream, storageLevel)) } } diff --git a/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java b/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java new file mode 100644 index 0000000000000..87954a31f60ce --- /dev/null +++ b/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kinesis; + +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.LocalJavaStreamingContext; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.junit.Test; + +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; + +/** + * Demonstrate the use of the KinesisUtils Java API + */ +public class JavaKinesisStreamSuite extends LocalJavaStreamingContext { + @Test + public void testKinesisStream() { + // Tests the API, does not actually test data receiving + JavaDStream kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", + "https://kinesis.us-west-2.amazonaws.com", new Duration(2000), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); + + ssc.stop(); + } +} diff --git a/extras/kinesis-asl/src/test/resources/log4j.properties b/extras/kinesis-asl/src/test/resources/log4j.properties index b4519708afdf2..b01d4482378c1 100644 --- a/extras/kinesis-asl/src/test/resources/log4j.properties +++ b/extras/kinesis-asl/src/test/resources/log4j.properties @@ -25,4 +25,3 @@ log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: # Ignore messages below warning level from Jetty, because it's a bit verbose log4j.logger.org.eclipse.jetty=WARN - diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index 3e97b2ce289c3..5db0b48113e88 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -19,12 +19,16 @@ package org.apache.spark.streaming.kinesis import java.nio.ByteBuffer import scala.collection.JavaConversions.seqAsJavaList -import scala.collection.mutable.ArrayBuffer +import org.apache.spark.annotation.Experimental +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Milliseconds +import org.apache.spark.streaming.Seconds +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.TestSuiteBase import org.apache.spark.streaming.util.Clock import org.apache.spark.streaming.util.ManualClock import org.scalatest.BeforeAndAfter -import org.scalatest.FunSuite import org.scalatest.Matchers import org.scalatest.mock.EasyMockSugar @@ -33,13 +37,25 @@ import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibD import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason import com.amazonaws.services.kinesis.model.Record /** * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor */ -class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter with EasyMockSugar { +class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter + with EasyMockSugar { + + test("kinesis input stream") { + val ssc = new StreamingContext(master, framework, batchDuration) + // Tests the API, does not actually test data receiving + 
val kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", + "https://kinesis.us-west-2.amazonaws.com", Seconds(2), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2); + ssc.stop() + } + val app = "TestKinesisReceiver" val stream = "mySparkStream" val endpoint = "endpoint-url" @@ -51,20 +67,18 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi val record2 = new Record() record2.setData(ByteBuffer.wrap("Learning Spark".getBytes())) val batch = List[Record](record1, record2) - val expectedArrayBuffer = new ArrayBuffer[Array[Byte]]() += record1.getData().array() += - record2.getData().array() var receiverMock: KinesisReceiver = _ var checkpointerMock: IRecordProcessorCheckpointer = _ var checkpointClockMock: ManualClock = _ - var checkpointStateMock: CheckpointState = _ + var checkpointStateMock: KinesisCheckpointState = _ var currentClockMock: Clock = _ before { receiverMock = mock[KinesisReceiver] checkpointerMock = mock[IRecordProcessorCheckpointer] checkpointClockMock = mock[ManualClock] - checkpointStateMock = mock[CheckpointState] + checkpointStateMock = mock[KinesisCheckpointState] currentClockMock = mock[Clock] } @@ -72,7 +86,8 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi val expectedCheckpointIntervalMillis = 10 expecting { receiverMock.isStopped().andReturn(false).once() - receiverMock.store(expectedArrayBuffer).once() + receiverMock.store(record1.getData().array()).once() + receiverMock.store(record2.getData().array()).once() checkpointStateMock.shouldCheckpoint().andReturn(true).once() checkpointerMock.checkpoint().once() checkpointStateMock.advanceCheckpoint().once() @@ -98,7 +113,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi test("shouldn't checkpoint when exception occurs during store") { expecting { receiverMock.isStopped().andReturn(false).once() - receiverMock.store(expectedArrayBuffer).andThrow(new RuntimeException()).once() + receiverMock.store(record1.getData().array()).andThrow(new RuntimeException()).once() } whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { intercept[RuntimeException] { @@ -115,7 +130,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } whenExecuting(currentClockMock) { val checkpointIntervalMillis = 10 - val checkpointState = new CheckpointState(checkpointIntervalMillis, currentClockMock) + val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) } } @@ -125,7 +140,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi currentClockMock.currentTime().andReturn(0).once() } whenExecuting(currentClockMock) { - val checkpointState = new CheckpointState(Long.MinValue, currentClockMock) + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MinValue), currentClockMock) assert(checkpointState.shouldCheckpoint()) } } @@ -135,7 +150,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi currentClockMock.currentTime().andReturn(0).once() } whenExecuting(currentClockMock) { - val checkpointState = new CheckpointState(Long.MaxValue, currentClockMock) + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MaxValue), currentClockMock) assert(!checkpointState.shouldCheckpoint()) } } @@ -146,7 +161,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers 
with BeforeAndAfter wi } whenExecuting(currentClockMock) { val checkpointIntervalMillis = 10 - val checkpointState = new CheckpointState(checkpointIntervalMillis, currentClockMock) + val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) checkpointState.advanceCheckpoint() assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) @@ -176,25 +191,13 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } } - test("string record converter") { - val expectedString = "http://sparkinaction.com" - val expectedByteArray = expectedString.getBytes() - val stringRecordSerializer = new KinesisStringRecordSerializer() - - expectedByteArray should be(stringRecordSerializer.serialize(expectedString)) - - expectedString should be(stringRecordSerializer.deserialize(expectedByteArray)) - expectedString should - be(stringRecordSerializer.deserialize(stringRecordSerializer.serialize(expectedString))) - } - test("retry success on first attempt") { val expectedIsStopped = false expecting { receiverMock.isStopped().andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessorUtils.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -206,7 +209,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi .andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessorUtils.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -218,7 +221,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi .andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessorUtils.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -229,7 +232,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } whenExecuting(checkpointerMock) { intercept[ShutdownException] { - KinesisRecordProcessorUtils.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) } } } @@ -240,7 +243,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } whenExecuting(checkpointerMock) { intercept[InvalidStateException] { - KinesisRecordProcessorUtils.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) } } } @@ -251,7 +254,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } whenExecuting(checkpointerMock) { intercept[RuntimeException] { - KinesisRecordProcessorUtils.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) } } } @@ -264,7 +267,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter wi } whenExecuting(checkpointerMock) { val exception = intercept[RuntimeException] { - KinesisRecordProcessorUtils.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) } 
exception.getMessage().shouldBe(expectedErrorMessage) } diff --git a/make-distribution.sh b/make-distribution.sh index 6a50bc74022ef..0a3283ecec6f8 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -196,8 +196,6 @@ cp -r "$FWDIR/bin" "$DISTDIR" cp -r "$FWDIR/python" "$DISTDIR" cp -r "$FWDIR/sbin" "$DISTDIR" cp -r "$FWDIR/ec2" "$DISTDIR" -cp -r "$FWDIR/extras/kinesis-asl/bin" "$DISTDIR" - # Download and copy in tachyon, if requested if [ "$SPARK_TACHYON" == "true" ]; then From 691a6be900015358d55a03c046f93d6336297ea2 Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Fri, 1 Aug 2014 14:47:00 -0700 Subject: [PATCH 10/12] fixed tests and formatting, fixed a bug with JavaKinesisWordCount during union of streams --- .../streaming/JavaKinesisWordCountASL.java | 2 +- .../streaming/KinesisWordCountASL.scala | 6 +++-- .../kinesis/KinesisReceiverSuite.scala | 22 +++++++++---------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java index 31793aaa020ba..f630dcd0ab16f 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java @@ -136,7 +136,7 @@ public static void main(String[] args) { /** Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ List> streamsList = new ArrayList>(numStreams); - for (int i = 0; i < streamsList.size(); i++) { + for (int i = 0; i < numStreams; i++) { streamsList.add( KinesisUtils.createStream(jssc, streamName, endpointUrl, checkpointInterval, InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()) diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala index 865eea433aeb9..8fe90dc18e471 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -94,7 +94,8 @@ object KinesisWordCountASL extends Logging { /** Determine the number of shards from the stream */ val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) kinesisClient.setEndpoint(endpointUrl) - val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size() + val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards() + .size() /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. 
*/ val numStreams = numShards @@ -108,7 +109,8 @@ object KinesisWordCountASL extends Logging { /** Setup the and SparkConfig and StreamingContext */ /** Spark Streaming batch interval */ val batchInterval = Milliseconds(2000) - val sparkConfig = new SparkConf().setAppName("KinesisWordCount").setMaster(s"local[$numSparkThreads]") + val sparkConfig = new SparkConf().setAppName("KinesisWordCount") + .setMaster(s"local[$numSparkThreads]") val ssc = new StreamingContext(sparkConfig, batchInterval) /** Setup the checkpoint directory used by Spark Streaming */ ssc.checkpoint("/tmp/checkpoint"); diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index 5db0b48113e88..cafac31961103 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -46,16 +46,7 @@ import com.amazonaws.services.kinesis.model.Record */ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter with EasyMockSugar { - - test("kinesis input stream") { - val ssc = new StreamingContext(master, framework, batchDuration) - // Tests the API, does not actually test data receiving - val kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", Seconds(2), - InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2); - ssc.stop() - } - + val app = "TestKinesisReceiver" val stream = "mySparkStream" val endpoint = "endpoint-url" @@ -74,7 +65,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft var checkpointStateMock: KinesisCheckpointState = _ var currentClockMock: Clock = _ - before { + override def beforeFunction() = { receiverMock = mock[KinesisReceiver] checkpointerMock = mock[IRecordProcessorCheckpointer] checkpointClockMock = mock[ManualClock] @@ -82,6 +73,15 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft currentClockMock = mock[Clock] } + test("kinesis utils api") { + val ssc = new StreamingContext(master, framework, batchDuration) + // Tests the API, does not actually test data receiving + val kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", + "https://kinesis.us-west-2.amazonaws.com", Seconds(2), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2); + ssc.stop() + } + test("process records including store and checkpoint") { val expectedCheckpointIntervalMillis = 10 expecting { From 0393795b53c2789973c081dba6f7651fd8678adc Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Fri, 1 Aug 2014 19:23:04 -0700 Subject: [PATCH 11/12] moved Kinesis examples out of examples/ and back into extras/kinesis-asl updated the build to only include kinesis-asl inside the examples jar when -Pkinesis-asl is specified --- assembly/pom.xml | 10 --- examples/pom.xml | 18 ++-- .../streaming/JavaKinesisWordCountASL.java | 44 +++++----- .../streaming/KinesisWordCountASL.scala | 86 +++++++++++-------- .../kinesis/KinesisCheckpointState.scala | 5 +- .../streaming/kinesis/KinesisReceiver.scala | 12 +-- .../kinesis/KinesisRecordProcessor.scala | 36 ++++---- 7 files changed, 112 insertions(+), 99 deletions(-) rename {examples => extras/kinesis-asl}/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java (82%) rename {examples => 
extras/kinesis-asl}/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala (76%) diff --git a/assembly/pom.xml b/assembly/pom.xml index 76099b074c7ed..703f15925bc44 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -185,16 +185,6 @@ - - kinesis-asl - - - org.apache.spark - spark-streaming-kinesis-asl_${scala.binary.version} - ${project.version} - - - bigtop-dist + org.apache.spark spark-streaming-kinesis-asl_2.10 jar diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java index 647772131d293..a8b907b241893 100644 --- a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java +++ b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java @@ -79,16 +79,12 @@ public final class JavaKinesisWordCountASL { private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class); - /* - * Make the constructor private to enforce singleton - */ + /* Make the constructor private to enforce singleton */ private JavaKinesisWordCountASL() { } public static void main(String[] args) { - /* - * Check that all required args were passed in. - */ + /* Check that all required args were passed in. */ if (args.length < 2) { System.err.println( "|Usage: KinesisWordCount \n" + @@ -131,9 +127,6 @@ public static void main(String[] args) { /* Setup the StreamingContext */ JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval); - /* Setup the checkpoint directory used by Spark Streaming */ - jssc.checkpoint("/tmp/checkpoint"); - /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ List> streamsList = new ArrayList>(numStreams); for (int i = 0; i < numStreams; i++) { @@ -163,7 +156,7 @@ public Iterable call(byte[] line) { } }); - /* Map each word to a (word, 1) tuple, then reduce/aggregate by key. */ + /* Map each word to a (word, 1) tuple, then reduce/aggregate by word. */ JavaPairDStream wordCounts = words.mapToPair( new PairFunction() { @Override @@ -177,7 +170,7 @@ public Integer call(Integer i1, Integer i2) { } }); - /* Print the first 10 wordCounts by key */ + /* Print the first 10 wordCounts */ wordCounts.print(); /* Start the streaming context and await termination */ diff --git a/extras/kinesis-asl/src/main/resources/log4j.properties b/extras/kinesis-asl/src/main/resources/log4j.properties index ad789341e62c9..97348fb5b6123 100644 --- a/extras/kinesis-asl/src/main/resources/log4j.properties +++ b/extras/kinesis-asl/src/main/resources/log4j.properties @@ -15,7 +15,6 @@ # limitations under the License. 
# -# Set everything to be logged to the file streaming/target/unit-tests.log log4j.rootCategory=WARN, console # File appender @@ -35,8 +34,4 @@ log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: log4j.logger.org.eclipse.jetty=WARN log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO - -# Log all Kinesis Streaming messages -log4j.logger.org.apache.spark.examples.streaming=DEBUG -log4j.logger.org.apache.spark.streaming.kinesis=DEBUG +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO \ No newline at end of file diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala index 63b610ac29879..d03edf8b30a9f 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -109,8 +109,6 @@ object KinesisWordCountASL extends Logging { val sparkConfig = new SparkConf().setAppName("KinesisWordCount") .setMaster(s"local[$numSparkThreads]") val ssc = new StreamingContext(sparkConfig, batchInterval) - /* Setup the checkpoint directory used by Spark Streaming */ - ssc.checkpoint("/tmp/checkpoint"); /* Kinesis checkpoint interval. Same as batchInterval for this example. */ val kinesisCheckpointInterval = batchInterval @@ -131,7 +129,7 @@ object KinesisWordCountASL extends Logging { /* Map each word to a (word, 1) tuple so we can reduce/aggregate by key. */ val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _) - /* Print the first 10 wordCounts by key */ + /* Print the first 10 wordCounts */ wordCounts.print() /* Start the streaming context and await termination */ diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index ba41435d2363d..8ecc2d90160b1 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -79,9 +79,7 @@ private[kinesis] class KinesisRecordProcessor( * This is not desirable, so we instead store a raw Array[Byte] and decouple * ourselves from Spark's internal serialization strategy. */ - batch.foreach(record => - KinesisRecordProcessor.retry(receiver.store(record.getData().array()), 4, 500) - ) + batch.foreach(record => receiver.store(record.getData().array())) logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") @@ -98,7 +96,7 @@ private[kinesis] class KinesisRecordProcessor( */ if (checkpointState.shouldCheckpoint()) { /* Perform the checkpoint */ - KinesisRecordProcessor.retry(checkpointer.checkpoint(), 4, 500) + KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100) /* Update the next checkpoint time */ checkpointState.advanceCheckpoint() @@ -147,8 +145,8 @@ private[kinesis] class KinesisRecordProcessor( * Checkpoint to indicate that all records from the shard have been drained and processed. * It's now OK to read from the new shards that resulted from a resharding event. 
*/ - case ShutdownReason.TERMINATE => KinesisRecordProcessor.retry(checkpointer.checkpoint(), - 4, 500) + case ShutdownReason.TERMINATE => + KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100) /* * ZOMBIE Use Case. NoOp. @@ -178,7 +176,7 @@ private[kinesis] object KinesisRecordProcessor extends Logging { * or any exception that persists after numRetriesLeft reaches 0 */ @annotation.tailrec - def retry[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { + def retryRandom[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { util.Try { expression } match { /* If the function succeeded, evaluate to x. */ case util.Success(x) => x @@ -190,7 +188,7 @@ private[kinesis] object KinesisRecordProcessor extends Logging { val backOffMillis = Random.nextInt(maxBackOffMillis) Thread.sleep(backOffMillis) logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e) - retry(expression, numRetriesLeft - 1, maxBackOffMillis) + retryRandom(expression, numRetriesLeft - 1, maxBackOffMillis) } /* Throw: Shutdown has been requested by the Kinesis Client Library.*/ case _: ShutdownException => { diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index d3560f6a690fc..713cac0e293c0 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.streaming.kinesis -import org.apache.spark.Logging import org.apache.spark.annotation.Experimental import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Duration @@ -33,7 +32,7 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionIn * :: Experimental :: */ @Experimental -object KinesisUtils extends Logging { +object KinesisUtils { /** * Create an InputDStream that pulls messages from a Kinesis stream. * @@ -82,8 +81,6 @@ object KinesisUtils extends Logging { * the tip of the stream (InitialPositionInStream.LATEST). * @param storageLevel Storage level to use for storing the received objects * - * @return JavaReceiverInputDStream[Array[Byte]] - * * @return JavaReceiverInputDStream[Array[Byte]] */ def createStream( diff --git a/extras/kinesis-asl/src/test/resources/log4j.properties b/extras/kinesis-asl/src/test/resources/log4j.properties index b01d4482378c1..e01e049595475 100644 --- a/extras/kinesis-asl/src/test/resources/log4j.properties +++ b/extras/kinesis-asl/src/test/resources/log4j.properties @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -# Set everything to be logged to the file streaming/target/unit-tests.log log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index cafac31961103..41dbd64c2b1fa 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -197,7 +197,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft receiverMock.isStopped().andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -209,7 +209,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft .andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -221,7 +221,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft .andReturn(expectedIsStopped).once() } whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retry(receiverMock.isStopped(), 2, 100) + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) assert(actualVal == expectedIsStopped) } } @@ -232,7 +232,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } whenExecuting(checkpointerMock) { intercept[ShutdownException] { - KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } } } @@ -243,7 +243,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } whenExecuting(checkpointerMock) { intercept[InvalidStateException] { - KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } } } @@ -254,7 +254,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } whenExecuting(checkpointerMock) { intercept[RuntimeException] { - KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } } } @@ -267,7 +267,7 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } whenExecuting(checkpointerMock) { val exception = intercept[RuntimeException] { - KinesisRecordProcessor.retry(checkpointerMock.checkpoint(), 2, 100) + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } exception.getMessage().shouldBe(expectedErrorMessage) }