diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md
index e94ac640..94330b90 100644
--- a/docs/quick-start/bullet-on-spark-with-rest.md
+++ b/docs/quick-start/bullet-on-spark-with-rest.md
@@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
```bash
cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
```
#### Step 8: Launch the Bullet Spark Backend
@@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
--master local[10] \
--class com.yahoo.bullet.spark.BulletSparkStreamingMain \
- --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+ --jars $BULLET_SPARK/bullet-spark-example.jar \
$BULLET_SPARK/bullet-spark.jar \
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
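
Note what the flag swap does: `--driver-class-path` only prepends entries to the driver's classpath, while `--jars` ships the listed jars with the application and adds them to both the driver and executor classpaths. With the REST PubSub served by the web service, no Kafka jar is needed here, so only the example producer jar is shipped. The full command after this change, assuming the environment variables set earlier in the quick start:

```bash
# Launch the backend, distributing the example producer jar via --jars.
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
  --master local[10] \
  --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
  --jars $BULLET_SPARK/bullet-spark-example.jar \
  $BULLET_SPARK/bullet-spark.jar \
  --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
```
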
diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md
index fb647935..385d9083 100644
--- a/docs/quick-start/bullet-on-spark.md
+++ b/docs/quick-start/bullet-on-spark.md
@@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
```bash
cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
```
#### Step 11: Launch the Bullet Spark Backend
@@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
--master local[10] \
--class com.yahoo.bullet.spark.BulletSparkStreamingMain \
- --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+ --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \
$BULLET_SPARK/bullet-spark.jar \
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
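
The same swap applies here, with one syntactic difference worth calling out: `--jars` takes a comma-separated list, whereas `--driver-class-path` used colon-separated entries. The Kafka PubSub jar still has to be shipped alongside the example producer jar so the executors can load it. The resulting command, under the same assumptions as above:

```bash
# Launch the Kafka-backed backend; note the comma (not a colon) between the jars.
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
  --master local[10] \
  --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
  --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \
  $BULLET_SPARK/bullet-spark.jar \
  --bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
```
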
diff --git a/examples/spark/pom.xml b/examples/spark/pom.xml
index 05abcd57..20e978e6 100644
--- a/examples/spark/pom.xml
+++ b/examples/spark/pom.xml
@@ -11,6 +11,7 @@
2.11
2.3.0
0.1.2
+ 0.2.0
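
The new 0.2.0 property most likely pins the bullet-record version: the updated settings files below set bullet.record.provider.class.name to com.yahoo.bullet.record.SimpleBulletRecordProvider, which ships with bullet-record. A sketch of the dependency this property would version; the property name is an assumption, not something this diff confirms:

```xml
<!-- Hypothetical: assumes the new 0.2.0 property is named bullet.record.version -->
<dependency>
    <groupId>com.yahoo.bullet</groupId>
    <artifactId>bullet-record</artifactId>
    <version>${bullet.record.version}</version>
</dependency>
```
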
diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml
index 0c63339b..728546c3 100644
--- a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml
+++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml
@@ -1,34 +1,103 @@
########################################################################################################################
-######################################### Bullet Spark Settings #####################################
+############################################### Bullet Spark defaults #################################################
########################################################################################################################
+# This is the name of the concrete implementation of Data Producer to use.
bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"
+
+# This is the batch interval of your Spark Streaming job. Find out more at
+# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
bullet.spark.batch.duration.ms: 1000
+
+# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
bullet.spark.receiver.query.block.size: 1
+
+# This is the maximum number of partitions that will be created by the Query Receiver.
bullet.spark.receiver.query.coalesce.partitions: 10
+
+# This is the number of Data Producers.
bullet.spark.data.producer.parallelism: 1
+
+# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path.
bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
+
+# If true, Bullet Spark recovers context from checkpoint files when restarting.
+# Otherwise Bullet Spark creates a new context.
bullet.spark.recover.from.checkpoint.enable: false
+
+# This is the Spark application name.
bullet.spark.app.name: "BulletSparkStreamingJob"
+
+# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (fixed by the producer).
+# It speeds up the processing within those partitions by distributing queries across multiple threads that run the
+# filtering operation concurrently.
bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since managing a
+# thread pool has fixed costs, one is only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
+
+# The following 2 settings set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If it is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the
+# query receiver. If you need settings for the publisher in this mode that differ from those used by the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below assumes that your PubSub settings start with bullet.pubsub.custom. Substitute your own.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+# bullet.pubsub.custom.publisher.setting: 1
+# bullet.pubsub.custom.nested.publisher.setting:
+# foo: bar
+# bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add settings that change infrequently here, and pass more variable ones on the command line when submitting
+# the job.
spark.serializer: "org.apache.spark.serializer.KryoSerializer"
spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
spark.streaming.stopGracefullyOnShutdown: "true"
spark.streaming.receiver.writeAheadLog.enable: "false"
spark.streaming.driver.writeAheadLog.allowBatching: "false"
########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
+# Add settings specific to your PubSub.
bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
bullet.pubsub.kafka.request.topic.name: "bullet.requests"
bullet.pubsub.kafka.response.topic.name: "bullet.responses"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
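
The two checkpoint multipliers above are easiest to read with concrete numbers plugged into the formula. A worked example using the defaults in this file:

```yaml
# Worked example with the defaults above (values from this file):
#   bullet.spark.batch.duration.ms: 1000                        -> batch duration = 1 s
#   bullet.spark.query.union.checkpoint.duration.multiplier: 10
#   checkpoint interval = 1 s * 10 = 10 s, i.e. every 10 batches
# A higher multiplier checkpoints less often but lets the RDD lineage graph grow longer between checkpoints.
```
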
diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml
deleted file mode 100644
index 728546c3..00000000
--- a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-########################################################################################################################
-############################################### Bullet Spark defaults #################################################
-########################################################################################################################
-# This is the name of the concrete implementation of Data Producer to use.
-bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"
-
-# This is the batch interval of your Spark Streaming job. Find out more at
-# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
-bullet.spark.batch.duration.ms: 1000
-
-# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
-bullet.spark.receiver.query.block.size: 1
-
-# This is the maximum number of partitions that will be created by the Query Receiver.
-bullet.spark.receiver.query.coalesce.partitions: 10
-
-# This is the number of Data Producers.
-bullet.spark.data.producer.parallelism: 1
-
-# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path.
-bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
-
-# If true, Bullet Spark recovers context from checkpoint files when restarting.
-# Otherwise Bullet Spark creates a new context.
-bullet.spark.recover.from.checkpoint.enable: false
-
-# This is the Spark application name.
-bullet.spark.app.name: "BulletSparkStreamingJob"
-
-# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
-bullet.spark.metrics.enabled: false
-
-# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
-# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
-# the data, choosing instead to parallelize within each partition (fixed by the producer).
-# It speeds up the processing within those partitions by distributing queries across multiple threads that run the
-# filtering operation concurrently.
-bullet.spark.filter.partition.parallel.mode.enabled: false
-
-# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
-bullet.spark.filter.partition.parallel.mode.parallelism: 4
-
-# This is the minimum number of queries at which the parallel partition filtering is applied. Since managing a
-# thread pool has fixed costs, one is only created once the number of queries exceeds this threshold.
-# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
-bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
-
-# The following 2 settings set the checkpoint intervals independently for each stateful transformation.
-# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
-# Use this to control the frequency of the checkpointing operation. If it is set too high, there might be too much
-# data to checkpoint (RDD lineage graph).
-bullet.spark.query.union.checkpoint.duration.multiplier: 10
-bullet.spark.join.checkpoint.duration.multiplier: 10
-
-# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the
-# query receiver. If you need settings for the publisher in this mode that differ from those used by the result
-# publisher, override them here. This setting needs to be a Map if provided.
-# The example below assumes that your PubSub settings start with bullet.pubsub.custom. Substitute your own.
-# Example:
-#
-# bullet.spark.loop.pubsub.overrides:
-# bullet.pubsub.custom.publisher.setting: 1
-# bullet.pubsub.custom.nested.publisher.setting:
-# foo: bar
-# bar: baz
-bullet.spark.loop.pubsub.overrides: {}
-
-########################################################################################################################
-############################################### Spark Streaming defaults ###############################################
-########################################################################################################################
-# The following settings are passed to Spark directly. You can add more settings here.
-# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
-# Add settings that change infrequently here, and pass more variable ones on the command line when submitting
-# the job.
-spark.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.streaming.stopGracefullyOnShutdown: "true"
-spark.streaming.receiver.writeAheadLog.enable: "false"
-spark.streaming.driver.writeAheadLog.allowBatching: "false"
-
-########################################################################################################################
-############################################### Query PubSub defaults ##################################################
-########################################################################################################################
-# This is the type of PubSub context to use for the result publisher.
-# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
-bullet.pubsub.context.name: "QUERY_PROCESSING"
-# This is the name of the concrete implementation of PubSub to use.
-# By default, it is the built-in REST in-memory PubSub.
-bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
-# Add settings specific to your PubSub.
-bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
-bullet.pubsub.kafka.request.topic.name: "bullet.requests"
-bullet.pubsub.kafka.response.topic.name: "bullet.responses"
-
-########################################################################################################################
-############################################### Bullet Core settings ###################################################
-########################################################################################################################
-## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
-## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
-########################################################################################################################
-########################################################################################################################
-# Factory class to get new BulletRecords.
-bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
diff --git a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml
index f1e37c06..396419ab 100644
--- a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml
+++ b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml
@@ -1,36 +1,104 @@
########################################################################################################################
-######################################### Bullet Spark Settings #####################################
+############################################### Bullet Spark defaults #################################################
########################################################################################################################
+# This is the name of the concrete implementation of Data Producer to use.
bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"
+
+# This is the batch interval of your Spark Streaming job. Find out more at
+# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
bullet.spark.batch.duration.ms: 1000
+
+# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
bullet.spark.receiver.query.block.size: 1
+
+# This is the maximum number of partitions that will be created by the Query Receiver.
bullet.spark.receiver.query.coalesce.partitions: 10
+
+# This is the number of Data Producers.
bullet.spark.data.producer.parallelism: 1
+
+# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path.
bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
+
+# If true, Bullet Spark recovers context from checkpoint files when restarting.
+# Otherwise Bullet Spark creates a new context.
bullet.spark.recover.from.checkpoint.enable: false
+
+# This is the Spark application name.
bullet.spark.app.name: "BulletSparkStreamingJob"
+
+# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (fixed by the producer).
+# It speeds up the processing within those partitions by distributing queries across multiple threads that run the
+# filtering operation concurrently.
bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since managing a
+# thread pool has fixed costs, one is only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
+
+# The following 2 settings set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If it is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the
+# query receiver. If you need settings for the publisher in this mode that differ from those used by the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below assumes that your PubSub settings start with bullet.pubsub.custom. Substitute your own.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+# bullet.pubsub.custom.publisher.setting: 1
+# bullet.pubsub.custom.nested.publisher.setting:
+# foo: bar
+# bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add settings that change infrequently here, and pass more variable ones on the command line when submitting
+# the job.
spark.serializer: "org.apache.spark.serializer.KryoSerializer"
spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
spark.streaming.stopGracefullyOnShutdown: "true"
spark.streaming.receiver.writeAheadLog.enable: "false"
spark.streaming.driver.writeAheadLog.allowBatching: "false"
########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub"
# A list of url(s) for the query endpoint. In the web service, this should contain a single URL for the query endpoint
# of the in-memory pubsub instance running on that web service. For the backend it should contain the urls of all
# the pubsub instances.
bullet.pubsub.rest.query.urls:
- "http://localhost:9999/api/bullet/pubsub/query"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
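
As the comment on bullet.pubsub.rest.query.urls notes, the web service lists only its own in-memory PubSub endpoint, while the backend lists the endpoints of all PubSub instances. A sketch for a backend talking to two web-service instances (the host names are placeholders, not from this diff):

```yaml
# Hypothetical backend configuration with two REST PubSub instances.
bullet.pubsub.rest.query.urls:
  - "http://web-service-1:9999/api/bullet/pubsub/query"
  - "http://web-service-2:9999/api/bullet/pubsub/query"
```
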