From 1bd5fbc98322a83a8485f88705c7af42bb16dd7f Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 16:45:05 -0700 Subject: [PATCH 1/3] Bullet Spark 0.1.2 for quick start --- examples/spark/pom.xml | 1 + .../bullet_spark_kafka_settings.yaml | 77 ++++++++++++- .../bullet_spark_kafka_settings_new.yaml | 103 ------------------ .../resources/bullet_spark_rest_settings.yaml | 76 ++++++++++++- 4 files changed, 146 insertions(+), 111 deletions(-) delete mode 100644 examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml diff --git a/examples/spark/pom.xml b/examples/spark/pom.xml index 05abcd57..20e978e6 100644 --- a/examples/spark/pom.xml +++ b/examples/spark/pom.xml @@ -11,6 +11,7 @@ 2.11 2.3.0 0.1.2 + 0.2.0 diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml index 0c63339b..728546c3 100644 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml @@ -1,34 +1,103 @@ ######################################################################################################################## -######################################### Bullet Spark Settings ##################################### +############################################### Bullet Spark defaults ################################################# ######################################################################################################################## +# This is the name of the concrete implementation of Data Producer to use. bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" + +# This is the batch interval of your Spark Streaming job. Find out more at +# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. bullet.spark.batch.duration.ms: 1000 + +# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. bullet.spark.receiver.query.block.size: 1 + +# This is the maximum number of partitions that will be created by the Query Receiver. bullet.spark.receiver.query.coalesce.partitions: 10 + +# This is the number of Data Producers. bullet.spark.data.producer.parallelism: 1 + +# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" + +# If true, Bullet Spark recovers context from checkpoint files when restarting. +# Otherwise Bullet Spark creates a new context. bullet.spark.recover.from.checkpoint.enable: false + +# This is the Spark application name. bullet.spark.app.name: "BulletSparkStreamingJob" + +# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). bullet.spark.metrics.enabled: false + +# If true, enables parallel processing of queries in each partition of the Filter Streaming job, This is particularly +# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning +# the data and instead choose to parallelize within each partition (fixed by the producer) instead. +# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering +# operation concurrently. bullet.spark.filter.partition.parallel.mode.enabled: false + +# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true. 
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10

+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If the settings for your publisher in this mode need to differ from the settings used in the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and pass more variable settings on the command line when
+# submitting the job.
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"

 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
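+# This example overrides that default below with the Kafka PubSub from the bullet-kafka artifact.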
bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" +# Add settings specific to your PubSub. bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" bullet.pubsub.kafka.request.topic.name: "bullet.requests" bullet.pubsub.kafka.response.topic.name: "bullet.responses" + +######################################################################################################################## +############################################### Bullet Core settings ################################################### +######################################################################################################################## +## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to: +## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml +######################################################################################################################## +######################################################################################################################## +# Factory class to get new BulletRecords. +bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml deleted file mode 100644 index 728546c3..00000000 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml +++ /dev/null @@ -1,103 +0,0 @@ -######################################################################################################################## -############################################### Bullet Spark defaults ################################################# -######################################################################################################################## -# This is the name of the concrete implementation of Data Producer to use. -bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" - -# This is the batch interval of your Spark Streaming job. Find out more at -# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. -bullet.spark.batch.duration.ms: 1000 - -# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. -bullet.spark.receiver.query.block.size: 1 - -# This is the maximum number of partitions that will be created by the Query Receiver. -bullet.spark.receiver.query.coalesce.partitions: 10 - -# This is the number of Data Producers. -bullet.spark.data.producer.parallelism: 1 - -# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. -bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" - -# If true, Bullet Spark recovers context from checkpoint files when restarting. -# Otherwise Bullet Spark creates a new context. -bullet.spark.recover.from.checkpoint.enable: false - -# This is the Spark application name. -bullet.spark.app.name: "BulletSparkStreamingJob" - -# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). -bullet.spark.metrics.enabled: false - -# If true, enables parallel processing of queries in each partition of the Filter Streaming job, This is particularly -# useful when using Producers that are Direct (e.g. 
DirectKafkaProducer) and you would like to avoid repartitioning -# the data and instead choose to parallelize within each partition (fixed by the producer) instead. -# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering -# operation concurrently. -bullet.spark.filter.partition.parallel.mode.enabled: false - -# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.parallelism: 4 - -# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed -# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold. -# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10 - -# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation. -# Checkpoint interval = Spark duration * checkpoint duration multiplier -# Use this to control the frequency of checkpointing operation. If this is set too high, there might be too much -# data to checkpoint (RDD lineage graph). -bullet.spark.query.union.checkpoint.duration.multiplier: 10 -bullet.spark.join.checkpoint.duration.multiplier: 10 - -# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to query -# receiver. If you need to change settings for your publisher in this mode that is different from the settings -# used in the result publisher, override them here. This setting needs to be a Map if provided. -# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours. -# Example: -# -# bullet.spark.loop.pubsub.overrides: -# bullet.pubsub.custom.publisher.setting: 1 -# bullet.pubsub.custom.nested.publisher.setting: -# foo: bar -# bar: baz -bullet.spark.loop.pubsub.overrides: {} - -######################################################################################################################## -############################################### Spark Streaming defaults ############################################### -######################################################################################################################## -# The following settings are passed to Spark directly. You can add more settings here. -# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html. -# Add configuration that change infrequently here and submit more variable settings while submitting the job on the -# command line. -spark.serializer: "org.apache.spark.serializer.KryoSerializer" -spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer" -spark.streaming.stopGracefullyOnShutdown: "true" -spark.streaming.receiver.writeAheadLog.enable: "false" -spark.streaming.driver.writeAheadLog.allowBatching: "false" - -######################################################################################################################## -############################################### Query PubSub defaults ################################################## -######################################################################################################################## -# This is the type of PubSub context to use for result publisher. -# The feedback publisher uses QUERY_SUBMISSION since it submits messages. 
-bullet.pubsub.context.name: "QUERY_PROCESSING" -# This is the name of the concrete implementation of PubSub to use. -# By default, it is the bulletin REST in-memory PubSub. -bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" -# Add settings specific to your PubSub. -bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" -bullet.pubsub.kafka.request.topic.name: "bullet.requests" -bullet.pubsub.kafka.response.topic.name: "bullet.responses" - -######################################################################################################################## -############################################### Bullet Core settings ################################################### -######################################################################################################################## -## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to: -## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml -######################################################################################################################## -######################################################################################################################## -# Factory class to get new BulletRecords. -bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml index f1e37c06..396419ab 100644 --- a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml @@ -1,36 +1,104 @@ ######################################################################################################################## -######################################### Bullet Spark Settings ##################################### +############################################### Bullet Spark defaults ################################################# ######################################################################################################################## +# This is the name of the concrete implementation of Data Producer to use. bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" + +# This is the batch interval of your Spark Streaming job. Find out more at +# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. bullet.spark.batch.duration.ms: 1000 + +# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. bullet.spark.receiver.query.block.size: 1 + +# This is the maximum number of partitions that will be created by the Query Receiver. bullet.spark.receiver.query.coalesce.partitions: 10 + +# This is the number of Data Producers. bullet.spark.data.producer.parallelism: 1 + +# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" + +# If true, Bullet Spark recovers context from checkpoint files when restarting. +# Otherwise Bullet Spark creates a new context. bullet.spark.recover.from.checkpoint.enable: false + +# This is the Spark application name. bullet.spark.app.name: "BulletSparkStreamingJob" + +# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). 
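+# When running locally, these are typically served by the Spark driver web UI, e.g. http://localhost:4040/metrics/json.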
 bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (whose count is fixed by the producer).
+# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering
+# operation concurrently.
 bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10

+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If the settings for your publisher in this mode need to differ from the settings used in the result
+# publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and pass more variable settings on the command line when
+# submitting the job.
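+# For example, a per-run override can be passed at submit time with --conf, such as:
+# --conf spark.streaming.stopGracefullyOnShutdown=false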
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"

 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
 bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub"
 # A list of url(s) for the query endpoint. In the web service, this should contain a single URL for the query endpoint
 # of the in-memory pubsub instance running on that web service. For the backend it should contain the urls of all
 # the pubsub instances.
 bullet.pubsub.rest.query.urls:
   - "http://localhost:9999/api/bullet/pubsub/query"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" From a259da63ae3e652867603a11d2e1c0b4df7c8cee Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 16:48:39 -0700 Subject: [PATCH 2/3] change quickstart --- docs/quick-start/bullet-on-spark-with-rest.md | 2 +- docs/quick-start/bullet-on-spark.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md index e94ac640..e3431832 100644 --- a/docs/quick-start/bullet-on-spark-with-rest.md +++ b/docs/quick-start/bullet-on-spark-with-rest.md @@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar ``` #### Step 8: Launch the Bullet Spark Backend diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md index fb647935..a567e7ff 100644 --- a/docs/quick-start/bullet-on-spark.md +++ b/docs/quick-start/bullet-on-spark.md @@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar ``` #### Step 11: Launch the Bullet Spark Backend From 19814c950a46ad14843080d8c76e58bf25ad47d5 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 19 Jun 2018 17:58:50 -0700 Subject: [PATCH 3/3] use --jars instead of --driver-class-path --- docs/quick-start/bullet-on-spark-with-rest.md | 2 +- docs/quick-start/bullet-on-spark.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md index e3431832..94330b90 100644 --- a/docs/quick-start/bullet-on-spark-with-rest.md +++ b/docs/quick-start/bullet-on-spark-with-rest.md @@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ - --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + --jars $BULLET_SPARK/bullet-spark-example.jar \ $BULLET_SPARK/bullet-spark.jar \ --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt & diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md index a567e7ff..385d9083 100644 --- a/docs/quick-start/bullet-on-spark.md +++ b/docs/quick-start/bullet-on-spark.md @@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ - --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \ $BULLET_SPARK/bullet-spark.jar \ 
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
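
# Note: unlike --driver-class-path (which only affects the driver), jars passed with --jars are included on both the
# driver and executor classpaths, which is why the example and PubSub jars are now passed that way.
# A quick, optional sanity check after submitting -- log.txt comes from the command above; $KAFKA_HOME and the local
# ZooKeeper address are assumptions from the earlier Kafka setup steps and may differ in your environment.
tail -f log.txt                                                      # watch the Bullet Spark driver log for errors
$KAFKA_HOME/bin/kafka-topics.sh --list --zookeeper localhost:2181    # expect bullet.requests and bullet.responses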