From 1bd5fbc98322a83a8485f88705c7af42bb16dd7f Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 16:45:05 -0700 Subject: [PATCH 1/3] Bullet Spark 0.1.2 for quick start --- examples/spark/pom.xml | 1 + .../bullet_spark_kafka_settings.yaml | 77 ++++++++++++- .../bullet_spark_kafka_settings_new.yaml | 103 ------------------ .../resources/bullet_spark_rest_settings.yaml | 76 ++++++++++++- 4 files changed, 146 insertions(+), 111 deletions(-) delete mode 100644 examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml diff --git a/examples/spark/pom.xml b/examples/spark/pom.xml index 05abcd57..20e978e6 100644 --- a/examples/spark/pom.xml +++ b/examples/spark/pom.xml @@ -11,6 +11,7 @@ 2.11 2.3.0 0.1.2 + 0.2.0 diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml index 0c63339b..728546c3 100644 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml @@ -1,34 +1,103 @@ ######################################################################################################################## -######################################### Bullet Spark Settings ##################################### +############################################### Bullet Spark defaults ################################################# ######################################################################################################################## +# This is the name of the concrete implementation of Data Producer to use. bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" + +# This is the batch interval of your Spark Streaming job. Find out more at +# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. bullet.spark.batch.duration.ms: 1000 + +# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. bullet.spark.receiver.query.block.size: 1 + +# This is the maximum number of partitions that will be created by the Query Receiver. bullet.spark.receiver.query.coalesce.partitions: 10 + +# This is the number of Data Producers. bullet.spark.data.producer.parallelism: 1 + +# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" + +# If true, Bullet Spark recovers context from checkpoint files when restarting. +# Otherwise Bullet Spark creates a new context. bullet.spark.recover.from.checkpoint.enable: false + +# This is the Spark application name. bullet.spark.app.name: "BulletSparkStreamingJob" + +# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). bullet.spark.metrics.enabled: false + +# If true, enables parallel processing of queries in each partition of the Filter Streaming job, This is particularly +# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning +# the data and instead choose to parallelize within each partition (fixed by the producer) instead. +# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering +# operation concurrently. bullet.spark.filter.partition.parallel.mode.enabled: false + +# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true. 
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10

+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If the settings for your publisher in this mode need to differ from the settings used in the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and pass more variable settings on the command line when
+# submitting the job.
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"

 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
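+# This example overrides that default below with the Kafka PubSub from the bullet-kafka artifact.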
bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" +# Add settings specific to your PubSub. bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" bullet.pubsub.kafka.request.topic.name: "bullet.requests" bullet.pubsub.kafka.response.topic.name: "bullet.responses" + +######################################################################################################################## +############################################### Bullet Core settings ################################################### +######################################################################################################################## +## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to: +## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml +######################################################################################################################## +######################################################################################################################## +# Factory class to get new BulletRecords. +bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml deleted file mode 100644 index 728546c3..00000000 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml +++ /dev/null @@ -1,103 +0,0 @@ -######################################################################################################################## -############################################### Bullet Spark defaults ################################################# -######################################################################################################################## -# This is the name of the concrete implementation of Data Producer to use. -bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" - -# This is the batch interval of your Spark Streaming job. Find out more at -# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. -bullet.spark.batch.duration.ms: 1000 - -# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. -bullet.spark.receiver.query.block.size: 1 - -# This is the maximum number of partitions that will be created by the Query Receiver. -bullet.spark.receiver.query.coalesce.partitions: 10 - -# This is the number of Data Producers. -bullet.spark.data.producer.parallelism: 1 - -# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. -bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" - -# If true, Bullet Spark recovers context from checkpoint files when restarting. -# Otherwise Bullet Spark creates a new context. -bullet.spark.recover.from.checkpoint.enable: false - -# This is the Spark application name. -bullet.spark.app.name: "BulletSparkStreamingJob" - -# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). -bullet.spark.metrics.enabled: false - -# If true, enables parallel processing of queries in each partition of the Filter Streaming job, This is particularly -# useful when using Producers that are Direct (e.g. 
DirectKafkaProducer) and you would like to avoid repartitioning -# the data and instead choose to parallelize within each partition (fixed by the producer) instead. -# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering -# operation concurrently. -bullet.spark.filter.partition.parallel.mode.enabled: false - -# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.parallelism: 4 - -# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed -# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold. -# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10 - -# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation. -# Checkpoint interval = Spark duration * checkpoint duration multiplier -# Use this to control the frequency of checkpointing operation. If this is set too high, there might be too much -# data to checkpoint (RDD lineage graph). -bullet.spark.query.union.checkpoint.duration.multiplier: 10 -bullet.spark.join.checkpoint.duration.multiplier: 10 - -# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to query -# receiver. If you need to change settings for your publisher in this mode that is different from the settings -# used in the result publisher, override them here. This setting needs to be a Map if provided. -# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours. -# Example: -# -# bullet.spark.loop.pubsub.overrides: -# bullet.pubsub.custom.publisher.setting: 1 -# bullet.pubsub.custom.nested.publisher.setting: -# foo: bar -# bar: baz -bullet.spark.loop.pubsub.overrides: {} - -######################################################################################################################## -############################################### Spark Streaming defaults ############################################### -######################################################################################################################## -# The following settings are passed to Spark directly. You can add more settings here. -# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html. -# Add configuration that change infrequently here and submit more variable settings while submitting the job on the -# command line. -spark.serializer: "org.apache.spark.serializer.KryoSerializer" -spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer" -spark.streaming.stopGracefullyOnShutdown: "true" -spark.streaming.receiver.writeAheadLog.enable: "false" -spark.streaming.driver.writeAheadLog.allowBatching: "false" - -######################################################################################################################## -############################################### Query PubSub defaults ################################################## -######################################################################################################################## -# This is the type of PubSub context to use for result publisher. -# The feedback publisher uses QUERY_SUBMISSION since it submits messages. 
-bullet.pubsub.context.name: "QUERY_PROCESSING" -# This is the name of the concrete implementation of PubSub to use. -# By default, it is the bulletin REST in-memory PubSub. -bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" -# Add settings specific to your PubSub. -bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" -bullet.pubsub.kafka.request.topic.name: "bullet.requests" -bullet.pubsub.kafka.response.topic.name: "bullet.responses" - -######################################################################################################################## -############################################### Bullet Core settings ################################################### -######################################################################################################################## -## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to: -## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml -######################################################################################################################## -######################################################################################################################## -# Factory class to get new BulletRecords. -bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml index f1e37c06..396419ab 100644 --- a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml @@ -1,36 +1,104 @@ ######################################################################################################################## -######################################### Bullet Spark Settings ##################################### +############################################### Bullet Spark defaults ################################################# ######################################################################################################################## +# This is the name of the concrete implementation of Data Producer to use. bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" + +# This is the batch interval of your Spark Streaming job. Find out more at +# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. bullet.spark.batch.duration.ms: 1000 + +# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. bullet.spark.receiver.query.block.size: 1 + +# This is the maximum number of partitions that will be created by the Query Receiver. bullet.spark.receiver.query.coalesce.partitions: 10 + +# This is the number of Data Producers. bullet.spark.data.producer.parallelism: 1 + +# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" + +# If true, Bullet Spark recovers context from checkpoint files when restarting. +# Otherwise Bullet Spark creates a new context. bullet.spark.recover.from.checkpoint.enable: false + +# This is the Spark application name. bullet.spark.app.name: "BulletSparkStreamingJob" + +# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). 
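+# When running locally, these are typically served by the Spark driver web UI, e.g. http://localhost:4040/metrics/json.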
 bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (whose count is fixed by the producer).
+# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering
+# operation concurrently.
 bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10

+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If the settings for your publisher in this mode need to differ from the settings used in the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and pass more variable settings on the command line when
+# submitting the job.
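+# For example, a per-run override can be passed at submit time with --conf, such as:
+# --conf spark.streaming.stopGracefullyOnShutdown=false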
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"

 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
 bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub"
 # A list of url(s) for the query endpoint. In the web service, this should contain a single URL for the query endpoint
 # of the in-memory pubsub instance running on that web service. For the backend it should contain the urls of all
 # the pubsub instances.
 bullet.pubsub.rest.query.urls:
   - "http://localhost:9999/api/bullet/pubsub/query"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" From a259da63ae3e652867603a11d2e1c0b4df7c8cee Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 16:48:39 -0700 Subject: [PATCH 2/3] change quickstart --- docs/quick-start/bullet-on-spark-with-rest.md | 2 +- docs/quick-start/bullet-on-spark.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md index e94ac640..e3431832 100644 --- a/docs/quick-start/bullet-on-spark-with-rest.md +++ b/docs/quick-start/bullet-on-spark-with-rest.md @@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar ``` #### Step 8: Launch the Bullet Spark Backend diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md index fb647935..a567e7ff 100644 --- a/docs/quick-start/bullet-on-spark.md +++ b/docs/quick-start/bullet-on-spark.md @@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar ``` #### Step 11: Launch the Bullet Spark Backend From 19814c950a46ad14843080d8c76e58bf25ad47d5 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 17:58:50 -0700 Subject: [PATCH 3/3] use --jars instead of --driver-class-path --- docs/quick-start/bullet-on-spark-with-rest.md | 2 +- docs/quick-start/bullet-on-spark.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md index e3431832..94330b90 100644 --- a/docs/quick-start/bullet-on-spark-with-rest.md +++ b/docs/quick-start/bullet-on-spark-with-rest.md @@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ - --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + --jars $BULLET_SPARK/bullet-spark-example.jar \ $BULLET_SPARK/bullet-spark.jar \ --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt & diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md index a567e7ff..385d9083 100644 --- a/docs/quick-start/bullet-on-spark.md +++ b/docs/quick-start/bullet-on-spark.md @@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ - --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \ $BULLET_SPARK/bullet-spark.jar \ 
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
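
# Note: unlike --driver-class-path (which only affects the driver), jars passed with --jars are included on both the
# driver and executor classpaths, which is why the example and PubSub jars are now passed that way.
# A quick, optional sanity check after submitting -- log.txt comes from the command above; $KAFKA_HOME and the local
# ZooKeeper address are assumptions from the earlier Kafka setup steps and may differ in your environment.
tail -f log.txt                                                      # watch the Bullet Spark driver log for errors
$KAFKA_HOME/bin/kafka-topics.sh --list --zookeeper localhost:2181    # expect bullet.requests and bullet.responses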