4 changes: 2 additions & 2 deletions docs/quick-start/bullet-on-spark-with-rest.md
@@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz

```bash
cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
```

#### Step 8: Launch the Bullet Spark Backend
@@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
--master local[10] \
--class com.yahoo.bullet.spark.BulletSparkStreamingMain \
--driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
--jars $BULLET_SPARK/bullet-spark-example.jar \
$BULLET_SPARK/bullet-spark.jar \
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &

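The flag change above is worth noting: `--jars` takes a comma-separated list and ships those jars to the driver and every executor, adding them to both classpaths, whereas the old `--driver-class-path` only prepended entries to the driver's classpath. A minimal sketch of the pattern (the jar and class names here are placeholders, not from this PR):

```bash
# --jars distributes dependency jars to the driver and all executors;
# the application jar itself is passed positionally at the end.
spark-submit \
  --master local[2] \
  --class com.example.StreamingMain \
  --jars dep-one.jar,dep-two.jar \
  app.jar
```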
4 changes: 2 additions & 2 deletions docs/quick-start/bullet-on-spark.md
@@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz

```bash
cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
```

#### Step 11: Launch the Bullet Spark Backend
@@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
--master local[10] \
--class com.yahoo.bullet.spark.BulletSparkStreamingMain \
--driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
--jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \
$BULLET_SPARK/bullet-spark.jar \
--bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &

1 change: 1 addition & 0 deletions examples/spark/pom.xml
@@ -11,6 +11,7 @@
<scala.dep.version>2.11</scala.dep.version>
<spark.version>2.3.0</spark.version>
<bullet.spark.version>0.1.2</bullet.spark.version>
<bullet.record.version>0.2.0</bullet.record.version>
</properties>

<repositories>
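The new `bullet.record.version` property is presumably referenced by a `bullet-record` dependency elsewhere in this POM (not shown in the hunk); a sketch of what such an entry would look like, with the groupId and artifactId assumed from the other Bullet artifacts:

```xml
<!-- Hypothetical dependency entry consuming the new property. -->
<dependency>
    <groupId>com.yahoo.bullet</groupId>
    <artifactId>bullet-record</artifactId>
    <version>${bullet.record.version}</version>
</dependency>
```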
77 changes: 73 additions & 4 deletions examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml
@@ -1,34 +1,103 @@
########################################################################################################################
######################################### Bullet Spark Settings #####################################
############################################### Bullet Spark defaults #################################################
########################################################################################################################
# This is the name of the concrete implementation of Data Producer to use.
bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"

# This is the batch interval of your Spark Streaming job. Find out more at
# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
bullet.spark.batch.duration.ms: 1000

# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
bullet.spark.receiver.query.block.size: 1

# This is the maximum number of partitions that will be created by the Query Receiver.
bullet.spark.receiver.query.coalesce.partitions: 10

# This is the number of Data Producers.
bullet.spark.data.producer.parallelism: 1

# This is the checkpoint directory. If you are running Spark on a cluster, the directory must be an HDFS path.
bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"

# If true, Bullet Spark recovers context from checkpoint files when restarting.
# Otherwise Bullet Spark creates a new context.
bullet.spark.recover.from.checkpoint.enable: false

# This is the Spark application name.
bullet.spark.app.name: "BulletSparkStreamingJob"

# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
bullet.spark.metrics.enabled: false

# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
# the data, choosing instead to parallelize within each partition (fixed by the producer).
# It speeds up the processing within those partitions by splitting queries across multiple threads that perform the
# filtering operation concurrently.
bullet.spark.filter.partition.parallel.mode.enabled: false

# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.parallelism: 4

# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold.
# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
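# Illustrative example using the defaults above: with 20 active queries, the threshold of 10 is exceeded, so each
# partition splits its queries across the 4-thread pool and filters them concurrently rather than serially.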

# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
# Checkpoint interval = Spark duration * checkpoint duration multiplier
# Use these to control the frequency of the checkpointing operation. If set too high, there might be too much
# data to checkpoint (a long RDD lineage graph).
bullet.spark.query.union.checkpoint.duration.multiplier: 10
bullet.spark.join.checkpoint.duration.multiplier: 10
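# For example, with the values in this file: a batch duration of 1000 ms and a multiplier of 10 yield a
# checkpoint interval of 1000 ms * 10 = 10000 ms, i.e. a checkpoint every 10 batches.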

# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop metadata messages back to the query
# receiver. If the publisher in this mode needs settings different from those used by the result publisher, override
# them here. This setting needs to be a Map if provided.
# The example below pretends that your PubSub settings start with bullet.pubsub.custom; substitute your own.
# Example:
#
# bullet.spark.loop.pubsub.overrides:
# bullet.pubsub.custom.publisher.setting: 1
# bullet.pubsub.custom.nested.publisher.setting:
# foo: bar
# bar: baz
bullet.spark.loop.pubsub.overrides: {}

########################################################################################################################
######################################### Spark Streaming Settings #####################################
############################################### Spark Streaming defaults ###############################################
########################################################################################################################
# The following settings are passed to Spark directly. You can add more settings here.
# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
# Add configuration that changes infrequently here and pass more variable settings on the command line when
# submitting the job.
spark.serializer: "org.apache.spark.serializer.KryoSerializer"
spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
spark.streaming.stopGracefullyOnShutdown: "true"
spark.streaming.receiver.writeAheadLog.enable: "false"
spark.streaming.driver.writeAheadLog.allowBatching: "false"
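# Illustrative example of the split suggested above: keep the stable settings in this file and pass per-run values
# on the command line, e.g. spark-submit --conf spark.executor.memory=2g ... (the value here is a placeholder).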

########################################################################################################################
######################################### Query PubSub Settings ########################################
############################################### Query PubSub defaults ##################################################
########################################################################################################################
# This is the type of PubSub context to use for the result publisher.
# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
bullet.pubsub.context.name: "QUERY_PROCESSING"
# This is the name of the concrete implementation of PubSub to use.
# By default, it is the built-in REST in-memory PubSub.
bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
# Add settings specific to your PubSub.
bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
bullet.pubsub.kafka.request.topic.name: "bullet.requests"
bullet.pubsub.kafka.response.topic.name: "bullet.responses"

########################################################################################################################
############################################### Bullet Core settings ###################################################
########################################################################################################################
## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
########################################################################################################################
########################################################################################################################
# Factory class to get new BulletRecords.
bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
103 changes: 0 additions & 103 deletions examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml

This file was deleted.
