From a29c5bafbee9f1d8b13911f05d97c61cbd30ead1 Mon Sep 17 00:00:00 2001 From: Nathan Speidel Date: Tue, 29 May 2018 16:09:52 -0700 Subject: [PATCH 1/5] Rewrite Makefile and add spark folder --- examples/Makefile | 16 +++++---- examples/spark/bin/launch.sh | 2 ++ .../main/resources/bullet_spark_settings.yaml | 35 +++++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 examples/spark/bin/launch.sh create mode 100644 examples/spark/src/main/resources/bullet_spark_settings.yaml diff --git a/examples/Makefile b/examples/Makefile index e144710d..4162beca 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -5,15 +5,19 @@ clean: cd storm && mvn clean build: - mkdir -p bullet-examples/storm - mkdir -p bullet-examples/web-service - mkdir -p bullet-examples/ui + mkdir -p bullet-examples/backend/storm cp install-all.sh bullet-examples/ cd storm && mvn package - cp storm/target/*jar-with-dependencies.jar bullet-examples/storm - cp storm/bin/launch.sh bullet-examples/storm - cp storm/src/main/resources/bullet_settings.yaml bullet-examples/storm + cp storm/target/*jar-with-dependencies.jar bullet-examples/backend/storm + cp storm/bin/launch.sh bullet-examples/backend/storm + cp storm/src/main/resources/bullet_settings.yaml bullet-examples/backend/storm + mkdir -p bullet-examples/backend/spark + cd spark && mvn package + cp spark/target/*-SNAPSHOT.jar bullet-examples/backend/spark + cp spark/src/main/resources/bullet_spark_settings.yaml bullet-examples/backend/spark + mkdir -p bullet-examples/web-service cp web-service/* bullet-examples/web-service + mkdir -p bullet-examples/ui cp ui/* bullet-examples/ui tar -czf examples_artifacts.tar.gz bullet-examples/* rm -rf bullet-examples diff --git a/examples/spark/bin/launch.sh b/examples/spark/bin/launch.sh new file mode 100644 index 00000000..d23299ab --- /dev/null +++ b/examples/spark/bin/launch.sh @@ -0,0 +1,2 @@ + +# This will launch bullet-spark diff --git a/examples/spark/src/main/resources/bullet_spark_settings.yaml b/examples/spark/src/main/resources/bullet_spark_settings.yaml new file mode 100644 index 00000000..11b422be --- /dev/null +++ b/examples/spark/src/main/resources/bullet_spark_settings.yaml @@ -0,0 +1,35 @@ +######################################################################################################################## +######################################### Bullet Spark Settings ##################################### +######################################################################################################################## +bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.receiver.RandomReceiver" +bullet.spark.batch.duration.ms: 1000 +bullet.spark.receiver.query.block.size: 1 +bullet.spark.receiver.query.coalesce.partitions: 10 +bullet.spark.data.producer.parallelism: 1 +bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" +bullet.spark.recover.from.checkpoint.enable: false +bullet.spark.app.name: "BulletSparkStreamingJob" +bullet.spark.metrics.enabled: false +bullet.spark.filter.partition.parallel.mode.enabled: false +bullet.spark.filter.partition.parallel.mode.parallelism: 4 +bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10 + +######################################################################################################################## +######################################### Spark Streaming Settings ##################################### 
+######################################################################################################################## +spark.serializer: "org.apache.spark.serializer.KryoSerializer" +spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer" +spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator" +spark.streaming.stopGracefullyOnShutdown: "true" +spark.streaming.receiver.writeAheadLog.enable: "false" +spark.streaming.driver.writeAheadLog.allowBatching: "false" + +######################################################################################################################## +######################################### Query PubSub Settings ######################################## +######################################################################################################################## +# This is the type of PubSub context to use +bullet.pubsub.context.name: "QUERY_PROCESSING" +# This is the name of the concrete implementation of PubSub to use. +bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" +bullet.pubsub.rest.query.urls: + - "http://localhost:9999/api/bullet/pubsub/query" From 4e09d897372e1e7bf20dcbc6aa5fe795a8f16d06 Mon Sep 17 00:00:00 2001 From: Nathan Speidel Date: Wed, 30 May 2018 13:20:06 -0700 Subject: [PATCH 2/5] Updated quick-start --- .../example_rest_pubsub_config.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 examples/web-service/example_rest_pubsub_config.yaml diff --git a/examples/web-service/example_rest_pubsub_config.yaml b/examples/web-service/example_rest_pubsub_config.yaml new file mode 100644 index 00000000..1761473a --- /dev/null +++ b/examples/web-service/example_rest_pubsub_config.yaml @@ -0,0 +1,18 @@ +bullet.pubsub.context.name: "QUERY_SUBMISSION" +bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" +# In-Memory pubsub settings that you may change for each instance (there may be multiple in-memory pubsub instances) +# The paths (not including the context.path) of the endpoints for reading/writing queries/responses +bullet.pubsub.rest.query.urls: + - "http://localhost:9999/api/bullet/pubsub/query" +bullet.pubsub.rest.result.url: "http://localhost:9999/api/bullet/pubsub/result" +# Http connection timout (used by both the web service and the backend) +bullet.pubsub.rest.connect.timeout.ms: 30000 +# Http connection retry limit (used by both the web service and the backend) +bullet.pubsub.rest.connect.retry.limit: 10 +# Maxiumum number of uncommitted messages allowed before read requests will wait for commits (used by both the web service and the backend) +bullet.pubsub.rest.subscriber.max.uncommitted.messages: 100 + +# Minimum time (ms) between http calls to the result subscriber REST endpoint +bullet.pubsub.rest.result.subscriber.min.wait.ms: 10 +# Minimum time (ms) between http calls to the query subscriber REST endpoint +bullet.pubsub.rest.query.subscriber.min.wait.ms: 10 From 3c751597c1650d6cbae1cf1e64cb572ed4d0d8fa Mon Sep 17 00:00:00 2001 From: Nathan Speidel Date: Wed, 30 May 2018 13:20:28 -0700 Subject: [PATCH 3/5] Updated Makefile --- docs/quick-start.md | 117 +++++++++++++++++++++++++------------------- examples/Makefile | 2 +- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/docs/quick-start.md b/docs/quick-start.md index 872bf67f..668d2fa9 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -1,113 +1,132 @@ # Quick Start -This section gets you running a mock instance of Bullet to play around with. 
The instance will run using [Bullet on Storm](backend/storm-setup.md) and use the [DRPC Pubsub](pubsub/storm-drpc.md). Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) in a [custom Storm spout](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java). If you want to use Bullet for your data, you will need to do read and convert your data to Bullet Records in a similar manner. +This section gets you running a mock instance of Bullet to play around with. The instance will run using Bullet on Spark and use the REST pubsub available as part of bullet-core. Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) using [some simple custom Spark code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala). If you want to use Bullet for your data, you will need to do read and convert your data to Bullet Records in a similar manner. At the end of this section, you will have: - * Setup the Bullet topology using a custom spout on [bullet-storm-0.6.2](https://github.com/yahoo/bullet-storm/releases/tag/bullet-storm-0.6.2) - * Setup the [Web Service](ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-0.1.1](https://github.com/yahoo/bullet-service/releases/tag/bullet-service-0.1.1) - * Setup the [DRPC PubSub](pubsub/storm-drpc.md) talking to the topology and Web Service. + * Launched the Bullet backend on spark + * Setup the [Web Service](ws/setup.md) with it's built-in REST pubsub enabled * Setup the [UI](ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/yahoo/bullet-ui/releases/tag/v0.4.0) **Prerequisites** * You will need to be on an Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed - * You will need enough CPU and RAM on your machine to run about 8-10 JVMs in ```server``` mode. You should have at least 2 GB free space on your disk. We will be setting up a Storm cluster with multiple components, a couple of Jetty instances and a Node server. ## Install Script -Simply run: - -```bash -curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -``` - -This will setup a local Storm cluster, a Bullet running on it, the Bullet Web Service and a Bullet UI for you. Once everything has launched, you should be able to go to the Bullet UI running locally at [http://localhost:8800](http://localhost:8800). You can then [**continue this guide from here**](#what-did-we-do). - -!!! note "Want to DIY?" - If you want to manually run all the commands or if the script died while doing something above (might want to perform the [teardown](#teardown) first), you can continue below. - +DO THIS LATER (one-liner?) ## Manual Installation -### Setting up Storm +### Setting up the Bullet Web Service and Built-In REST Pub-Sub -To set up a clean working environment, let's start with creating some directories. +Before we launch the Bullet Spark backend, we first need to setup the Bullet Web Service and PubSub layer. 
The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package which is a simple implementation of the PubSub layer using REST endpoints. The bullet web service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service. #### Step 1: Setup directories and examples ```bash export BULLET_HOME=$(pwd)/bullet-quickstart -mkdir -p $BULLET_HOME/backend/storm +mkdir -p $BULLET_HOME/backend/spark mkdir -p $BULLET_HOME/service mkdir -p $BULLET_HOME/ui cd $BULLET_HOME -curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz +DO THE THING to download the compressed folder - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz tar -xzf examples_artifacts.tar.gz export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples ``` -#### Step 2: Install Storm 1.1 +#### Step 2: Install the Bullet Web Service ```bash -cd $BULLET_HOME/backend -curl -O http://apache.org/dist/storm/apache-storm-1.1.2/apache-storm-1.1.2.zip -unzip apache-storm-1.1.2.zip -export PATH=$(pwd)/apache-storm-1.1.2/bin/:$PATH +cd $BULLET_HOME/service +curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar +cp $BULLET_EXAMPLES/web-service/example_rest_pubsub_config.yaml $BULLET_HOME/service/ +cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/ ``` -Add a DRPC server setting to the Storm config: + +#### Step 3: Launch the Web Service ```bash -echo 'drpc.servers: ["127.0.0.1"]' >> apache-storm-1.1.2/conf/storm.yaml +cd $BULLET_HOME/service +java -jar bullet-service.jar --bullet.pubsub.config=$BULLET_HOME/service/example_rest_pubsub_config.yaml --bullet.schema.file=$BULLET_HOME/service/example_columns.json --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path=. --logging.file=log.txt &> log.txt & ``` -#### Step 3: Launch Storm components +The Web Service usually takes ~10-15 seconds to start. -Launch each of the following components, in order and wait for the commands to go through. You may have to do these one at a time. You will see a JVM being launched for each one and connection messages as the components communicate through Zookeeper. +You can check the status of the Web Service by looking at the Web Service log: ```bash -storm dev-zookeeper & -storm nimbus & -storm drpc & -storm ui & -storm logviewer & -storm supervisor & +vim $BULLET_HOME/service/log.txt ``` -It may take 30-60 seconds for all the components to launch. +The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently). -Once everything is up without errors, visit [http://localhost:8080](http://localhost:8080) and see if the Storm UI loads. 
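+If you would rather not open the whole file, searching for that startup message works just as well. This is only a convenience sketch and assumes the log location used in the launch command above:
+
+```bash
+# Prints the startup line once the Web Service is ready; no output means it is not up yet
+grep "Started Application" $BULLET_HOME/service/log.txt
+```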
+#### Step 4: Test the Web Service (optional) -#### Step 4: Test Storm (Optional) +We can check that the Web Service is up and running by getting the example columns through the API: -Before Bullet, test to see if Storm and DRPC are up and running by launching a example topology that comes with your Storm installation: +```bash +curl -s http://localhost:9999/api/bullet/columns +``` + +#### Step 5: Test the PubSub Layer (optional) + +To ensure that the Web Service has been configured to expose the necessary PubSub REST endpoints, we can "write" a fake-query to the PubSub, and then read it back by hand. Since there is currently no backend running, any queries written to the PubSub will simply be stored there until we read it manually. + +Write a fake empty query to the query endpoint: ```bash -storm jar apache-storm-1.1.2/examples/storm-starter/storm-starter-topologies-1.1.2.jar org.apache.storm.starter.BasicDRPCTopology topology +curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query ``` -Visit your UI with a browser and see if a topology with name "topology" is running. If everything is good, you should be able to ping DRPC with: +Receiving no error response should indicate that the fake query was written to the pubsub. Then read a query from this same endpoint: ```bash -curl localhost:3774/drpc/exclamation/foo +curl http://localhost:9999/api/bullet/pubsub/query ``` -and get back a ```foo!```. Any string you pass as part of the URL is returned to you with a "!" at the end. +This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint. + + +### Install Spark 2.2.1 + +We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). -Kill this topology after with: +#### Step 6: Install Spark 2.2.1 ```bash -storm kill topology +export BULLET_SPARK=$BULLET_HOME/backend/spark +cd $BULLET_SPARK +curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz +tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ``` -!!! note "Local mode cleanup" +#### Step 7: Setup Bullet-Spark and Example Data Producer + +```bash +cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +``` + +#### Step 8: Launch the Spark Backend + +```bash +$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit --master local[10] --class com.yahoo.bullet.spark.BulletSparkStreamingMain --driver-class-path $BULLET_SPARK/bullet-spark-example.jar:$BULLET_SPARK/bullet-spark.jar $BULLET_SPARK/bullet-spark.jar &> log.txt & +``` + + + + + + + + + + - If you notice any problems while setting up storm or while relaunching a topology, it may be because some state is corrupted. When running Storm in this fashion, states and serializations are stored in ```storm-local``` and ```/tmp/```. You may want to ```rm -rf storm-local/* /tmp/dev-storm-zookeeper``` to clean up this state before relaunching Storm components. See the [tear down section](#teardown) on how to kill any running instances. -### Setting up the example Bullet topology -Now that Storm is up and running, we can put Bullet on it. We will use an example Spout that runs on Bullet 0.4.3 on our Storm cluster. 
The source is available [here](https://github.com/yahoo/bullet-docs/blob/master/examples/storm). This was part of the artifact that you installed in Step 1. #### Step 5: Setup the Storm example diff --git a/examples/Makefile b/examples/Makefile index 4162beca..5ec01eb6 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -13,7 +13,7 @@ build: cp storm/src/main/resources/bullet_settings.yaml bullet-examples/backend/storm mkdir -p bullet-examples/backend/spark cd spark && mvn package - cp spark/target/*-SNAPSHOT.jar bullet-examples/backend/spark + cp spark/target/bullet-spark-example-0.0.1-SNAPSHOT.jar bullet-examples/backend/spark/bullet-spark-example.jar cp spark/src/main/resources/bullet_spark_settings.yaml bullet-examples/backend/spark mkdir -p bullet-examples/web-service cp web-service/* bullet-examples/web-service From c4438c39bf3fdcc3cec990048f77f3b9ede72797 Mon Sep 17 00:00:00 2001 From: Nathan Speidel Date: Tue, 5 Jun 2018 16:15:08 -0700 Subject: [PATCH 4/5] Added spark quickstart without new UI --- docs/quick-start/bullet-on-spark.md | 476 ++++++++++++++++++ .../bullet-on-storm.md} | 92 +--- examples/Makefile | 3 +- examples/install-all-spark.sh | 344 +++++++++++++ .../{install-all.sh => install-all-storm.sh} | 0 .../main/resources/bullet_spark_settings.yaml | 12 +- examples/ui/env-settings.json | 2 +- .../example_kafka_pubsub_config.yaml | 5 + mkdocs.yml | 4 +- 9 files changed, 846 insertions(+), 92 deletions(-) create mode 100644 docs/quick-start/bullet-on-spark.md rename docs/{quick-start.md => quick-start/bullet-on-storm.md} (81%) create mode 100755 examples/install-all-spark.sh rename examples/{install-all.sh => install-all-storm.sh} (100%) create mode 100644 examples/web-service/example_kafka_pubsub_config.yaml diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md new file mode 100644 index 00000000..f3a7364f --- /dev/null +++ b/docs/quick-start/bullet-on-spark.md @@ -0,0 +1,476 @@ +# Quick Start - Bullet on Spark + +In this section we will setup a mock instance of Bullet to play around with. We will use [bullet-spark](https://github.com/bullet-db/bullet-spark) to run the backend of Bullet on the [Spark](https://spark.apache.org/) framework. And we will use the [Bullet Kafka PubSub](https://github.com/bullet-db/bullet-kafka). + +At the end of this section, you will have: + + * Launched the Bullet backend on spark + * Setup the [Web Service](ws/setup.md) + * Setup the [UI](ui/setup.md) to talk to the web service + +**Prerequisites** + + * You will need to be on an Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed + * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed + +## Install Script + +DO THIS LATER (one-liner?) Remove this 88 + +## Manual Installation + +### Setup Kafka + +For this instance of Bullet we will use the kafka PubSub implementation found in [bullet-spark](https://github.com/bullet-db/bullet-spark). So we will first download and run Kafka, and setup a couple Kafka topics. 
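+Steps 1 through 5 below walk through that setup. Once you finish Step 5, a quick way to confirm the Kafka side of the PubSub is to list the topics; both `bullet.requests` and `bullet.responses` should appear. This is just a sanity-check sketch and assumes the local ZooKeeper used in this guide:
+
+```bash
+# Lists all topics known to the local ZooKeeper; expect to see bullet.requests and bullet.responses
+$KAFKA_DIR/bin/kafka-topics.sh --list --zookeeper localhost:2181
+```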
+ +#### Step 1: Setup directories and examples + +```bash +export BULLET_HOME=$(pwd)/bullet-quickstart +mkdir -p $BULLET_HOME/backend/spark +mkdir -p $BULLET_HOME/pubsub +mkdir -p $BULLET_HOME/service +mkdir -p $BULLET_HOME/ui +cd $BULLET_HOME +Remove this 88: DO THE THING to download the compressed folder (needs to be put on git) - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz - now: cp ~/bullet/bullet-db.github.io/examples/examples_artifacts.tar.gz . +tar -xzf examples_artifacts.tar.gz +export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples +``` + +#### Step 2: Download and Install Kafka + +```bash +cd $BULLET_HOME/pubsub +curl -Lo bullet-kafka.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-kafka/0.3.0/bullet-kafka-0.3.0-fat.jar +curl -LO https://archive.apache.org/dist/kafka/0.11.0.1/kafka_2.12-0.11.0.1.tgz +tar -xzf kafka_2.12-0.11.0.1.tgz +export KAFKA_DIR=$BULLET_HOME/pubsub/kafka_2.12-0.11.0.1 +``` + +#### Step 3: Start Zookeeper + +```bash +$KAFKA_DIR/bin/zookeeper-server-start.sh $KAFKA_DIR/config/zookeeper.properties & +``` + +#### Step 4: Start Kafka + +Give Zookeeper a ~5-10 seconds to start up, then start Kafka: + +```bash +$KAFKA_DIR/bin/kafka-server-start.sh $KAFKA_DIR/config/server.properties & +``` + +#### Step 5: Create Kafka Topics + +The Bullet Kafka PubSub uses two kafka topics. One to send messages from the web service to the backend, and one to send messages from the backend to the web service. So we will create a kafka topic called "bullet.requests" and another called "bullet.responses". + +```bash +$KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic bullet.requests +$KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic bullet.responses +``` + +### Setup Web Service + +#### Step 6: Install the Bullet Web Service + +```bash +cd $BULLET_HOME/service +curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar +cp $BULLET_EXAMPLES/web-service/example_kafka_pubsub_config.yaml $BULLET_HOME/service/ +cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/ +``` + +#### Step 7: Launch the Web Service + +**Note:** This is a single command (new-lines are escaped) - run it in a single bash command: + +```bash +java -Dloader.path=$BULLET_HOME/pubsub/bullet-kafka.jar -jar bullet-service.jar \ + --bullet.pubsub.config=$BULLET_HOME/service/example_kafka_pubsub_config.yaml \ + --bullet.schema.file=$BULLET_HOME/service/example_columns.json \ + --server.port=9999 \ + --logging.path=. \ + --logging.file=log.txt &> log.txt & +``` + +#### Step 8: Test the Web Service (optional) + +We can check that the Web Service is up and running by getting the example columns through the API: + +```bash +curl -s http://localhost:9999/api/bullet/columns +``` + +You can also check the status of the Web Service by looking at the Web Service log: $BULLET_HOME/service/log.txt + +### Setup Bullet Backend on Spark + +We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). 
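+Until the backend is launched, any query submitted through the Web Service simply accumulates on the `bullet.requests` topic. If you are curious, you can watch that topic with the console consumer while you work through the next steps. This is an optional sketch that assumes the default broker at `localhost:9092`; the messages are serialized PubSub messages, so expect binary-looking output, and press Ctrl-C to stop:
+
+```bash
+# Tails the request topic; queries published by the Web Service will show up here (in serialized form)
+$KAFKA_DIR/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic bullet.requests --from-beginning
+```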
+ +#### Step 9: Install Spark 2.2.1 + +```bash +export BULLET_SPARK=$BULLET_HOME/backend/spark +cd $BULLET_SPARK +curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz +tar -xzf spark-2.2.1-bin-hadoop2.7.tgz +``` + +#### Step 10: Setup Bullet-Spark and Example Data Producer + +```bash +cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK +# Remove this 88: THIS (next line) does not yet exist - make sure it's available +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +``` + +#### Step 11: Launch the Bullet Spark Backend + +**Note:** This is a single command (new-lines are escaped) - run it in a single bash command: + +```bash +$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ + --master local[10] \ + --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ + --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + $BULLET_SPARK/bullet-spark.jar \ + --bullet-spark-conf=$BULLET_SPARK/bullet_spark_settings.yaml &> log.txt & + +``` + +The backend will usually be up and running usually within 5-10 seconds. Once it is running you can get information about the Spark job in the Spark UI, which can be seen in your browser at **http://localhost:4040** by default. The Web Service will now be hooked up through the Kafka PubSub to the Spark backend. +To test it you can now run a Bullet query by hitting the web service directly: + +```bash +curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query +``` + +This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. + +!!! note "What is this data?" + + This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka. + +### Setting up the Bullet UI + +** ------------------------------------------------------------ This needs to be re-done with the new UI!! ------------------------------------------------------------------------- ** + +#### Step 12: Install Node + +```bash +cd $BULLET_HOME/ui +curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash +source ~/.bashrc +nvm install v6.9.4 +nvm use v6.9.4 +``` + +#### Step 13: Install the Bullet UI + +```bash +curl -LO https://github.com/yahoo/bullet-ui/releases/download/v0.4.0/bullet-ui-v0.4.0.tar.gz +tar -xzf bullet-ui-v0.4.0.tar.gz +cp $BULLET_EXAMPLES/ui/env-settings.json config/ +``` + +#### Step 14: Launch the UI + +```bash +PORT=8800 node express-server.js & +``` + +Visit [http://localhost:8800](http://localhost:8800) to query your topology with the UI. See [UI usage](ui/usage.md) for some example queries and interactions using this UI. You see what the Schema means by visiting the Schema section. + +!!! note "Running it remotely?" + + If you access the UI from another machine than where your UI is actually running, you will need to edit ```config/env-settings.json```. Since the UI is a client-side app, the machine that your browser is running on will fetch the UI and attempt to use these settings to talk to the Web Service. 
Since they point to localhost by default, your browser will attempt to connect there and fail. An easy fix is to change ```localhost``` in your env-settings.json to point to the host name where you will hosting the UI. This will be the same as the UI host you use in the browser. You can also do a local port forward on the machine accessing the UI by running: + ```ssh -N -L 8800:localhost:8800 -L 9999:localhost:9999 hostname-of-the-quickstart-components 2>&1``` + +## Playing around with the instance + +Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI. + + + + + + + + + + + + + + + + +## Teardown +* Web service: + * pkill -f [e]xample_kafka_pubsub_config.yaml + * pkill -f [b]ullet-spark +* Kafka and Zookeeper: + * TRY THIS: + * pkill -f kafka-server-start + * pkill -f [p]ubsub/kafka_2.12-0.11.0.1 + * /Users/nspeidel/bullet-quickstart/pubsub/kafka_2.12-0.11.0.1/bin/zookeeper-server-stop.sh + * pkill -f [k]afka_2.12-0.11.0.1 + * pkill -f [k]afka-server-start + * pkill -f [z]ookeeper.properties + + + + + + + + + +## Teardown + +If you were using the [Install Script](#install-script) or if you don't want to manually bring down everything, you can run: + +```bash +curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -s cleanup +``` + +If you were performing the steps yourself, you can also manually cleanup **all the components and all the downloads** using: + +| | | +| -------------- | ---------------------------------------------------------------- | +| UI | ```pkill -f [e]xpress-server.js``` | +| Web Service | ```pkill -f [e]xample_kafka_pubsub_config.yaml``` | +| Spark | ```pkill -f [b]ullet-spark``` | +| Kafka | ```${KAFKA_DIR}/bin/kafka-server-stop.sh``` | +| Zookeeper | ```${KAFKA_DIR}/bin/zookeeper-server-stop.sh``` | +| File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` | + + + +This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```. + +## What did we do? + +This section will go over the various custom pieces this example plugged into Bullet, so you can better understand what we did. + +### Storm topology + +The topology was the Bullet topology plugged in with a custom spout. This spout is implemented in this [example project](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configurable. If you examine $BULLET_HOME/backend/storm/launch.sh, you'll see the following: + +```bash +storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ + com.yahoo.bullet.Topology \ + --bullet-conf bullet_settings.yaml \ + --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \ + --bullet-spout-parallelism 1 \ + ... + --bullet-spout-arg 20 \ + --bullet-spout-arg 101 \ + ... +``` + +This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. 
We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. + +The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps. + +!!! note "I thought you said hundreds of thousands of records..." + + 200 records is not Big Data by any stretch of the imagination but this Quick Start is running everything on one machine and is meant to introduce you to what Bullet does. In practice, you would scale and run your components with CPU and memory configurations to accommodate for your data volume and querying needs. + + +Let's look at the [custom spout code](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that generates the data. + +```java + @Override + public void nextTuple() { + long timeNow = System.nanoTime(); + // Only emit if we are still in the interval and haven't gone over our per period max + if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { + outputCollector.emit(new Values(generateRecord()), DUMMY_ID); + generatedThisPeriod++; + } + if (timeNow > nextIntervalStart) { + log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod); + nextIntervalStart = timeNow + period; + generatedThisPeriod = 0; + periodCount++; + } + // It is courteous to sleep for a short time if you're not emitting anything... + try { + Thread.sleep(1); + } catch (InterruptedException e) { + log.error("Error: ", e); + } + } +``` + +This method above emits the tuples. The Storm framework calls this method. This function only emits at most the given maximum tuples per period. + +!!! note "Why a DUMMY_ID?" + + When the spout emits the randomly generated tuple, it attaches a ```DUMMY_ID``` to it. In Storm terms, this is a message ID. By adding a message ID, this tuple can be made to flow reliably. The Bullet component that receives this tuple (Filter bolt) acknowledges or "acks" this tuple. If the tuple did not make it to Filter bolt within a configured timeout window, Storm will call a ```fail(Object messageId)``` method on the spout. This particular spout does not define one and hence the usage of a ```DUMMY_ID```. If your source of data can identify records uniquely and you can re-emit them on a fail, you should attach that actual ID in place of the ```DUMMY_ID```. 
+ +```java + private BulletRecord generateRecord() { + BulletRecord record = new BulletRecord(); + String uuid = UUID.randomUUID().toString(); + + record.setString(STRING, uuid); + record.setLong(LONG, (long) generatedThisPeriod); + record.setDouble(DOUBLE, random.nextDouble()); + record.setString(TYPE, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setLong(DURATION, System.currentTimeMillis() % INTEGER_POOL[random.nextInt(INTEGER_POOL.length)]); + + Map booleanMap = new HashMap<>(4); + booleanMap.put(uuid.substring(0, 8), random.nextBoolean()); + booleanMap.put(uuid.substring(9, 13), random.nextBoolean()); + booleanMap.put(uuid.substring(14, 18), random.nextBoolean()); + booleanMap.put(uuid.substring(19, 23), random.nextBoolean()); + record.setBooleanMap(BOOLEAN_MAP, booleanMap); + + Map statsMap = new HashMap<>(4); + statsMap.put(PERIOD_COUNT, periodCount); + statsMap.put(RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod); + statsMap.put(NANO_TIME, System.nanoTime()); + statsMap.put(TIMESTAMP, System.currentTimeMillis()); + record.setLongMap(STATS_MAP, statsMap); + + Map randomMapA = new HashMap<>(2); + Map randomMapB = new HashMap<>(2); + randomMapA.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapA.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setListOfStringMap(LIST, asList(randomMapA, randomMapB)); + + return record; + } +``` + +This method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types. + +If you put Bullet on your data, you will need to write a Spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. + +### PubSub + +We used the [DRPC PubSub](pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend. + +```yaml +bullet.pubsub.context.name: "QUERY_PROCESSING" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.function: "bullet-query" +``` + +For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service. + +```yaml +bullet.pubsub.context.name: "QUERY_SUBMISSION" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.servers: + - 127.0.0.1 +bullet.pubsub.storm.drpc.function: "bullet-query" +bullet.pubsub.storm.drpc.http.protocol: "http" +bullet.pubsub.storm.drpc.http.port: "3774" +bullet.pubsub.storm.drpc.http.path: "drpc" +bullet.pubsub.storm.drpc.http.connect.retry.limit: 3 +bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000 +``` + +### Web Service + +We launched the Web Service using two custom files - a PubSub configuration YAML file and JSON schema file. + +The JSON columns file contains the schema for our data specified in JSON. 
Since our schema is not going to change, we use the Web Service to serve it from a file. If your schema changes dynamically, you will need to provide your own endpoint to the UI. + +The following is a snippet from the [JSON file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_columns.json). Notice how the types of the fields are specified. Also, if you have generated BulletRecord with Map fields whose keys are known, you can specify them here using ```enumerations```. + +```javascript +[ + { + "name": "probability", + "type": "DOUBLE", + "description": "Generated from Random#nextDouble" + }, + ... + { + "name": "stats_map", + "type": "MAP", + "subtype": "LONG", + "description": "This map contains some numeric information such as the current number of periods etc.", + "enumerations": [ + ... + {"name": "nano_time", "description": "The ns time when this record was generated"} + ] + }, + { + "name": "classifiers", + "type": "LIST", + "subtype": "MAP", + "description": "This contains two maps, each with: field_A and field_B whose values are randomly chosen from: foo, bar, baz, qux, quux, norf" + } +] +``` +The contents of the [PubSub configuration file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_drpc_pubsub_config.yaml) was discussed in the [PubSub section above](#pubsub). + +### UI + +Finally, we configured the UI with the custom environment specific settings file. We did not add any environments since we only had the one. + +```javascript +{ + "default": { + "queryHost": "http://localhost:9999", + "queryNamespace": "api/bullet", + "queryPath": "query", + "schemaHost": "http://localhost:9999", + "schemaNamespace": "api/bullet", + "helpLinks": [ + { + "name": "Examples", + "link": "https://yahoo.github.io/bullet-docs/ui/usage" + } + ], + "bugLink": "https://github.com/yahoo/bullet-ui/issues", + "modelVersion": 2, + "migrations": { + "deletions": "result" + }, + "defaultValues": { + "defaultValues": { + "aggregationMaxSize": 1024, + "rawMaxSize": 500, + "durationMaxSecs": 540, + "distributionNumberOfPoints": 11, + "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", + "distributionQuantileStart": 0, + "distributionQuantileEnd": 1, + "distributionQuantileIncrement": 0.1, + "queryTimeoutSecs": 3, + "sketches": { + "countDistinctMaxEntries": 16384, + "groupByMaxEntries": 512, + "distributionMaxEntries": 1024, + "distributionMaxNumberOfPoints": 200, + "topKMaxEntries": 1024, + "topKErrorType": "No False Negatives" + }, + "metadataKeyMapping": { + "theta": "theta", + "uniquesEstimate": "uniques_estimate", + "queryCreationTime": "query_receive_time", + "queryTerminationTime": "query_finish_time", + "estimatedResult": "was_estimated", + "standardDeviations": "standard_deviations", + "normalizedRankError": "normalized_rank_error", + "maximumCountError": "maximum_count_error", + "itemsSeen": "items_seen", + "minimumValue": "minimum_value", + "maximumValue": "maximum_value" + } + } + } +} +``` + +Since we served our schema through the same Web Service as our queries, both these point to our Web Service. Note that there is no ```schemaPath``` because it must be the constant string ```columns```. If you define a custom endpoint for your schema, you must ensure that it can be obtained by making a GET request to ```schemaHost/schemaNamespace/columns```. 
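+With the default settings above, the UI resolves the schema from the same endpoint we tested earlier. A quick check like the following (a sketch assuming the example host and namespace) confirms the URL the UI will hit:
+
+```bash
+# Should return the JSON schema served from example_columns.json
+curl -s http://localhost:9999/api/bullet/columns
+```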
diff --git a/docs/quick-start.md b/docs/quick-start/bullet-on-storm.md similarity index 81% rename from docs/quick-start.md rename to docs/quick-start/bullet-on-storm.md index 668d2fa9..eed84fb4 100644 --- a/docs/quick-start.md +++ b/docs/quick-start/bullet-on-storm.md @@ -19,7 +19,7 @@ DO THIS LATER (one-liner?) ## Manual Installation -### Setting up the Bullet Web Service and Built-In REST Pub-Sub +### Setup the Bullet Web Service and REST Pub-Sub Before we launch the Bullet Spark backend, we first need to setup the Bullet Web Service and PubSub layer. The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package which is a simple implementation of the PubSub layer using REST endpoints. The bullet web service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service. @@ -57,7 +57,7 @@ The Web Service usually takes ~10-15 seconds to start. You can check the status of the Web Service by looking at the Web Service log: ```bash -vim $BULLET_HOME/service/log.txt +cat $BULLET_HOME/service/log.txt ``` The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently). @@ -89,7 +89,7 @@ curl http://localhost:9999/api/bullet/pubsub/query This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint. -### Install Spark 2.2.1 +### Setup Bullet Backend on Spark We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). @@ -109,105 +109,31 @@ cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar ``` -#### Step 8: Launch the Spark Backend +#### Step 8: Launch the Bullet Spark Backend ```bash $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit --master local[10] --class com.yahoo.bullet.spark.BulletSparkStreamingMain --driver-class-path $BULLET_SPARK/bullet-spark-example.jar:$BULLET_SPARK/bullet-spark.jar $BULLET_SPARK/bullet-spark.jar &> log.txt & ``` - - - - - - - - - - - - - -#### Step 5: Setup the Storm example - -```bash -cp $BULLET_EXAMPLES/storm/* $BULLET_HOME/backend/storm -``` - -!!! note "Settings" - - Take a look at bullet_settings.yaml for the settings that are being overridden for this example. You can add or change settings as you like by referring to [core Bullet settings in bullet_defaults.yaml](https://github.com/yahoo/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml) and [Storm settings in bullet_storm_defaults.yaml](https://github.com/yahoo/bullet-storm/blob/master/src/main/resources/bullet_storm_defaults.yaml). In particular, we have [customized these settings](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/resources/bullet_settings.yaml) that affect the Bullet queries you can run: - - ```bullet.query.max.duration: 570000``` Longest query time can be 570s. The Storm cluster default DRPC timeout is 600s. - - ```bullet.query.aggregation.raw.max.size: 500``` The max ```RAW``` records you can fetch is 500. 
- - ```bullet.query.aggregation.max.size: 1024``` The max records you can fetch for any query is 1024. - - ```bullet.query.aggregation.count.distinct.sketch.entries: 16384``` We can count 16384 unique values exactly. Approximates after. - - ```bullet.query.aggregation.group.sketch.entries: 1024``` The max unique groups can be 1024. Uniform sample after. - - ```bullet.query.aggregation.distribution.sketch.entries: 1024``` Determines the normalized rank error for distributions. - - ```bullet.query.aggregation.top.k.sketch.entries: 1024``` 0.75 times this number is the number of unique items for which counts can be done exactly. Approximates after. - - ```bullet.query.aggregation.distribution.max.points: 200``` The maximum number of points you can generate, use or provide for a Distribution aggregation. - -!!! note "Want to tweak the example topology code?" - - You will need to clone the [examples repository](https://github.com/yahoo/bullet-docs/tree/master/examples/storm) and customize it. To build the examples, you'll need to install [Maven 3](https://maven.apache.org/install.html). - - ```cd $BULLET_HOME && git clone git@github.com:yahoo/bullet-docs.git``` - - ```cd bullet-docs/examples/storm && mvn package``` - - You will find the ```bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar``` in ```$BULLET_HOME/bullet-docs/examples/storm/target/``` - - You can also make the ```examples_artifacts.tar.gz``` file with all the settings that is placed in ```$BULLET_EXAMPLES``` by just running ```make``` in the ```bullet-docs/examples/``` folder. - -#### Step 6: Launch the topology +The backend will usually be up and running within 5-10 seconds. The Web Service will now be hooked up through the REST PubSub to the Spark backend. You can now run a Bullet query by hitting the web service directly: ```bash -cd $BULLET_HOME/backend/storm && ./launch.sh +curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query ``` -Visit the UI and see if the topology is up. You should see the ```DataSource``` spout begin emitting records. -Test the Bullet topology by: +This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. -```bash -curl -s -X POST -d '{"id":"", "content":"{}"}' http://localhost:3774/drpc/bullet-query -``` +!!! note "What is this data?" -You should get a random record (serialized as a String inside a JSON message sent back through the PubSub) from Bullet. + This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka. -!!! note "What is this data?" - This data is randomly generated by the [custom Storm spout](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that is in the example topology you just launched. In practice, your spout would read from an actual data source such as Kafka instead. See [below](#storm-topology) for more details about this random data spout. 
-### Setting up the Bullet Web Service -#### Step 7: Install the Bullet Web Service -```bash -cd $BULLET_HOME/service -curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.1.1/bullet-service-0.1.1-embedded.jar -cp $BULLET_EXAMPLES/web-service/example* $BULLET_HOME/service/ -cp $BULLET_EXAMPLES/storm/*jar-with-dependencies.jar $BULLET_HOME/service/bullet-storm-jar-with-dependencies.jar -``` -#### Step 8: Launch the Web Service -```bash -cd $BULLET_HOME/service -java -Dloader.path=bullet-storm-jar-with-dependencies.jar -jar bullet-service.jar --bullet.pubsub.config=example_drpc_pubsub_config.yaml --bullet.schema.file=example_columns.json --server.port=9999 --logging.path=. --logging.file=log.txt &> log.txt & -``` -You can verify that it is up by running a Bullet query or getting the example columns through the API: -```bash -curl -s -H 'Content-Type: text/plain' -X POST -d '{}' http://localhost:9999/api/bullet/query -curl -s http://localhost:9999/api/bullet/columns -``` ### Setting up the Bullet UI diff --git a/examples/Makefile b/examples/Makefile index 5ec01eb6..b4177bee 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -6,12 +6,13 @@ clean: build: mkdir -p bullet-examples/backend/storm - cp install-all.sh bullet-examples/ + cp install-all-storm.sh bullet-examples/ cd storm && mvn package cp storm/target/*jar-with-dependencies.jar bullet-examples/backend/storm cp storm/bin/launch.sh bullet-examples/backend/storm cp storm/src/main/resources/bullet_settings.yaml bullet-examples/backend/storm mkdir -p bullet-examples/backend/spark + cp install-all-spark.sh bullet-examples/ cd spark && mvn package cp spark/target/bullet-spark-example-0.0.1-SNAPSHOT.jar bullet-examples/backend/spark/bullet-spark-example.jar cp spark/src/main/resources/bullet_spark_settings.yaml bullet-examples/backend/spark diff --git a/examples/install-all-spark.sh b/examples/install-all-spark.sh new file mode 100755 index 00000000..fd357951 --- /dev/null +++ b/examples/install-all-spark.sh @@ -0,0 +1,344 @@ +#! /usr/bin/env bash + +set -euo pipefail + +BULLET_EXAMPLES_VERSION=0.4.0 +BULLET_UI_VERSION=0.4.0 +BULLET_WS_VERSION=0.2.1 +BULLET_KAFKA_VERSION=0.3.0 +KAFKA_VERSION=0.11.0.1 +SPARK_VERSION=2.2.1 +NVM_VERSION=0.33.1 +NODE_VERSION=6.9.4 + +KAFKA_TOPIC_REQUESTS=bullet.requests +KAFKA_TOPIC_RESPONSES=bullet.responses + +println() { + local DATE + DATE="$(date)" + printf "%s [BULLET-QUICKSTART] %s\n" "${DATE}" "$1" +} + +print_versions() { + println "Using the following artifacts..." + println "Bullet Examples: ${BULLET_EXAMPLES_VERSION}" + println "Bullet Web Service: ${BULLET_WS_VERSION}" + println "Bullet UI: ${BULLET_UI_VERSION}" + println "Kafka: ${KAFKA_VERSION}" + println "NVM: ${NVM_VERSION}" + println "Node.js: ${NODE_VERSION}" + println "Done!" +} + +download() { + local URL="$1" + local FILE="$2" + + local FILE_PATH="${BULLET_DOWNLOADS}/${FILE}" + + if [[ -s "${FILE_PATH}" ]]; then + println "Download exists in ${FILE_PATH}. Skipping download..." + else + println "curl --retry 2 -#LO \"${URL}/${FILE}\"" + cd "${BULLET_DOWNLOADS}" && { curl --retry 2 -#LO "${URL}/${FILE}" ; cd - &> /dev/null; } + fi +} + +export_vars() { + local PWD + PWD="$(pwd)" + + println "Exporting some variables..." + export BULLET_HOME="${PWD}/bullet-quickstart" + export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples + export BULLET_DOWNLOADS=$BULLET_HOME/bullet-downloads + export BULLET_SPARK=${BULLET_HOME}/backend/spark + println "Done!" +} + +setup() { + println "Setting up directories..." 
+ mkdir -p "${BULLET_HOME}/backend/spark" + mkdir -p "${BULLET_HOME}/service" + mkdir -p "${BULLET_HOME}/ui" + mkdir -p "${BULLET_HOME}/pubsub" + mkdir -p "${BULLET_DOWNLOADS}" + println "Done!" +} + +install_bullet_examples() { + println "Downloading Bullet Examples ${BULLET_EXAMPLES_VERSION}..." + download "https://github.com/yahoo/bullet-docs/releases/download/v${BULLET_EXAMPLES_VERSION}" "examples_artifacts.tar.gz" + + println "Installing Bullet Examples..." + tar -xzf "${BULLET_DOWNLOADS}/examples_artifacts.tar.gz" -C "${BULLET_HOME}" + println "Done!" +} + +install_kafka() { + local KAFKA="kafka_2.12-${KAFKA_VERSION}" + local PUBSUB="${BULLET_HOME}/pubsub/" + + println "Downloading Kafka ${KAFKA_VERSION}..." + download "https://archive.apache.org/dist/kafka/${KAFKA_VERSION}" "${KAFKA}.tgz" + + println "Installing Kafka ..." + tar -xzf ${BULLET_DOWNLOADS}/${KAFKA}.tgz -C ${PUBSUB} + export KAFKA_DIR=${PUBSUB}${KAFKA} + + println "Done!" +} + +install_bullet_kafka() { + local BULLET_KAFKA="bullet-kafka-${BULLET_KAFKA_VERSION}-fat.jar" + local PUBSUB="${BULLET_HOME}/pubsub/" + + println "Downloading bullet-kafka ${BULLET_KAFKA_VERSION}..." + download "http://jcenter.bintray.com/com/yahoo/bullet/bullet-kafka/${BULLET_KAFKA_VERSION}" "${BULLET_KAFKA}" + cp ${BULLET_DOWNLOADS}/${BULLET_KAFKA} ${PUBSUB}${BULLET_KAFKA} + export BULLET_KAFKA_JAR=${PUBSUB}${BULLET_KAFKA} + + println "Done!" +} + +launch_kafka() { + println "Launching Zookeeper..." + $KAFKA_DIR/bin/zookeeper-server-start.sh $KAFKA_DIR/config/zookeeper.properties & + sleep 3 + + println "Launching Kafka..." + $KAFKA_DIR/bin/kafka-server-start.sh $KAFKA_DIR/config/server.properties & + + sleep 3 + println "Done!" +} + +create_topics() { + set +e + println "Creating kafka topics ${KAFKA_TOPIC_REQUESTS} and ${KAFKA_TOPIC_RESPONSES}..." + $KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic ${KAFKA_TOPIC_REQUESTS} + $KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic ${KAFKA_TOPIC_RESPONSES} + set -e + + sleep 3 + println "Done!" +} + +install_web_service() { + local BULLET_WEB_SERVICE="bullet-service-${BULLET_WS_VERSION}-embedded.jar" + + println "Downloading bullet web service version ${BULLET_WS_VERSION}..." + download "http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/${BULLET_WS_VERSION}" "${BULLET_WEB_SERVICE}" + + println "Installing bullet web service..." + cp ${BULLET_DOWNLOADS}/${BULLET_WEB_SERVICE} ${BULLET_HOME}/service/ + cp ${BULLET_EXAMPLES}/web-service/example_kafka_pubsub_config.yaml ${BULLET_HOME}/service/ + cp ${BULLET_EXAMPLES}/web-service/example_columns.json ${BULLET_HOME}/service/ + export BULLET_WS_JAR=${BULLET_HOME}/service/${BULLET_WEB_SERVICE} + + println "Done!" +} + +launch_web_service() { + local BULLET_SERVICE_HOME="${BULLET_HOME}/service" + + println "Launching Bullet Web Service..." + cd "${BULLET_SERVICE_HOME}" + java -Dloader.path=${BULLET_KAFKA_JAR} -jar ${BULLET_WS_JAR} \ + --bullet.pubsub.config=${BULLET_SERVICE_HOME}/example_kafka_pubsub_config.yaml \ + --bullet.schema.file=${BULLET_SERVICE_HOME}/example_columns.json \ + --server.port=9999 \ + --logging.path=. \ + --logging.file=log.txt &> log.txt & + + println "Sleeping for 15 s to ensure Bullet Web Service is up..." + sleep 15 + + println "Testing the Web Service: Getting column schema..." 
+ println "" + curl -s http://localhost:9999/api/bullet/columns + println "Finished Bullet Web Service test" +} + +install_spark() { + local SPARK="spark-${SPARK_VERSION}-bin-hadoop2.7.tgz" + + println "Downloading Spark version ${SPARK_VERSION}..." + download "http://www-us.apache.org/dist/spark/spark-${SPARK_VERSION}" "${SPARK}" + + println "Installing Spark version ${SPARK_VERSION}..." + cp ${BULLET_DOWNLOADS}/${SPARK} ${BULLET_HOME}/backend/spark/ + + tar -xzf "${BULLET_HOME}/backend/spark/${SPARK}" -C "${BULLET_HOME}/backend/spark/" + export SPARK_DIR="${BULLET_HOME}/backend/spark/spark-${SPARK_VERSION}-bin-hadoop2.7" + + println "Done!" +} + +install_bullet_spark() { + cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK + # Remove this 88 - THIS STILL NEEDS to be implemented - download the thing (it's not available online yet because we haven't released this version yet): + # Something like this: curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +} + +launch_bullet_spark() { + cd ${BULLET_SPARK} + println "Launching bullet-spark..." + ${SPARK_DIR}/bin/spark-submit \ + --master local[10] \ + --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ + --driver-class-path $BULLET_SPARK/bullet-spark.jar:${BULLET_KAFKA_JAR}:$BULLET_SPARK/bullet-spark-example.jar \ + $BULLET_SPARK/bullet-spark.jar \ + --bullet-spark-conf=$BULLET_SPARK/bullet_spark_settings.yaml &> log.txt & + + println "Sleeping for 15 s to ensure bullet-spark is up and running..." + sleep 15 + + println "Done! You should now be able to query Bullet through the web service. Try this:" + println "curl -s -H 'Content-Type: text/plain' -X POST -d '{\"aggregation\": {\"size\": 1}}' http://localhost:9999/api/bullet/sse-query" +} + + + +install_node() { + # NVM unset var bug + set +u + + println "Trying to install nvm. If there is a failure, manually perform: " + println " curl -s https://raw.githubusercontent.com/creationix/nvm/v${NVM_VERSION}/install.sh | bash" + println " nvm install v${NODE_VERSION}" + println " nvm use v${NODE_VERSION}" + println "and then try this script again..." + + println "Downloading and installing NVM ${NVM_VERSION}..." + curl --retry 2 -s "https://raw.githubusercontent.com/creationix/nvm/v${NVM_VERSION}/install.sh" | bash + + println "Loading nvm into current environment if installation successful..." + [ -s "${HOME}/.nvm/nvm.sh" ] && source "${HOME}/.nvm/nvm.sh" + println "Done!" + + println "Installing Node ${NODE_VERSION}..." + nvm install "v${NODE_VERSION}" + nvm use "v${NODE_VERSION}" + + set -u + + println "Done!" +} + +launch_bullet_ui() { + local BULLET_UI_ARCHIVE="bullet-ui-v${BULLET_UI_VERSION}.tar.gz" + + println "Downloading Bullet UI ${BULLET_UI_VERSION}..." + download "https://github.com/yahoo/bullet-ui/releases/download/v${BULLET_UI_VERSION}" "${BULLET_UI_ARCHIVE}" + + cd "${BULLET_HOME}/ui" + + println "Installing Bullet UI..." + tar -xzf "${BULLET_DOWNLOADS}/${BULLET_UI_ARCHIVE}" + + println "Configuring Bullet UI..." + cp "${BULLET_EXAMPLES}/ui/env-settings.json" config/ + + println "Launching Bullet UI..." + PORT=8800 node express-server.js & + + println "Sleeping for 5 s to ensure Bullet UI is up..." + sleep 5 + println "Done!" 
+} + +cleanup() { + set +e + + pkill -f "[e]xpress-server.js" + pkill -f "[e]xample_kafka_pubsub_config.yaml" + pkill -f "[b]ullet-spark" + ${KAFKA_DIR}/bin/kafka-server-stop.sh + ${KAFKA_DIR}/bin/zookeeper-server-stop.sh + + sleep 3 + + rm -rf "${BULLET_EXAMPLES}" "${BULLET_HOME}/backend" "${BULLET_HOME}/service" \ + "${BULLET_HOME}/ui" "${BULLET_HOME}/pubsub" /tmp/dev-storm-zookeeper + + set -e +} + +teardown() { + println "Killing and cleaning up all Bullet components..." + cleanup &> /dev/null + println "Done!" +} + +unset_all() { + unset -f print_versions println download export_vars setup \ + install_bullet_examples \ + install_storm launch_storm launch_bullet_storm \ + launch_bullet_web_service \ + install_node launch_bullet_ui \ + cleanup teardown unset_all launch +} + +launch() { + print_versions + export_vars + + teardown + + setup + + # install_bullet_examples + # <------------- Remove this 88 - the above line needs to be uncommented and all the below stuff should be removed once this artifact actualy exists on the git cloud or whatever + cp ~/bullet/bullet-db.github.io/examples/examples_artifacts.tar.gz ${BULLET_DOWNLOADS}/ + tar -xzf "${BULLET_DOWNLOADS}/examples_artifacts.tar.gz" -C "${BULLET_HOME}" # <------------ Remove this 88 - remove this line and the one above it once the artifact is actulaly on github + + install_kafka + install_bullet_kafka + launch_kafka + create_topics + + install_web_service + launch_web_service + + install_spark + # install_bullet_spark + # <------------- Remove this 88 - the above line needs to be uncommented and all the below stuff should be removed once this artifact actualy exists on the git cloud or whatever + cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK # <------------ Remove this 88 + cp ~/bullet/bullet-spark/target/bullet-spark-0.1.1-SNAPSHOT-standalone.jar $BULLET_SPARK/bullet-spark.jar # <------------ Remove this 88 + + launch_bullet_spark + + # Remove this 88 - deal with the following two lines: + # Now do the UI stuff once the new UI is ready + # ALSO - DON'T FORGET! The teardown stuff doesn't work unless you run the whole script (the "else" block at the bottom won't work) because the KAFKA_DIR isn't defined unless you run install_kafka function) - so fix that somehow + + + + + + + # install_node + # launch_bullet_ui + + # println "All components launched! Visit http://localhost:8800 (default) for the UI" + # unset_all +} + +clean() { + println "Launching cleanup..." + export_vars + teardown + println "Not deleting ${BULLET_DOWNLOADS}, ${HOME}/.nvm or nvm additions to ${HOME}/{.profile, .bash_profile, .zshrc, .bashrc}..." 
+ println "Cleaned up ${BULLET_HOME} and /tmp" + println "To delete all download artifacts (excluding nvm), do:" + println " rm -rf ${BULLET_HOME}" + unset_all +} + +if [ $# -eq 0 ]; then + launch +else + clean +fi diff --git a/examples/install-all.sh b/examples/install-all-storm.sh similarity index 100% rename from examples/install-all.sh rename to examples/install-all-storm.sh diff --git a/examples/spark/src/main/resources/bullet_spark_settings.yaml b/examples/spark/src/main/resources/bullet_spark_settings.yaml index 11b422be..06413ea2 100644 --- a/examples/spark/src/main/resources/bullet_spark_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_settings.yaml @@ -1,7 +1,7 @@ ######################################################################################################################## ######################################### Bullet Spark Settings ##################################### ######################################################################################################################## -bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.receiver.RandomReceiver" +bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" bullet.spark.batch.duration.ms: 1000 bullet.spark.receiver.query.block.size: 1 bullet.spark.receiver.query.coalesce.partitions: 10 @@ -27,9 +27,9 @@ spark.streaming.driver.writeAheadLog.allowBatching: "false" ######################################################################################################################## ######################################### Query PubSub Settings ######################################## ######################################################################################################################## -# This is the type of PubSub context to use bullet.pubsub.context.name: "QUERY_PROCESSING" -# This is the name of the concrete implementation of PubSub to use. 
-bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" -bullet.pubsub.rest.query.urls: - - "http://localhost:9999/api/bullet/pubsub/query" +bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" +bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" +bullet.pubsub.kafka.request.topic.name: "bullet.requests" +bullet.pubsub.kafka.response.topic.name: "bullet.responses" + diff --git a/examples/ui/env-settings.json b/examples/ui/env-settings.json index d9bacd2f..9aa00db6 100644 --- a/examples/ui/env-settings.json +++ b/examples/ui/env-settings.json @@ -2,7 +2,7 @@ "default": { "queryHost": "http://localhost:9999", "queryNamespace": "api/bullet", - "queryPath": "query", + "queryPath": "sse-query", "schemaHost": "http://localhost:9999", "schemaNamespace": "api/bullet", "helpLinks": [ diff --git a/examples/web-service/example_kafka_pubsub_config.yaml b/examples/web-service/example_kafka_pubsub_config.yaml new file mode 100644 index 00000000..ccdd6b4d --- /dev/null +++ b/examples/web-service/example_kafka_pubsub_config.yaml @@ -0,0 +1,5 @@ +bullet.pubsub.context.name: "QUERY_SUBMISSION" +bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" +bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" +bullet.pubsub.kafka.request.topic.name: "bullet.requests" +bullet.pubsub.kafka.response.topic.name: "bullet.responses" diff --git a/mkdocs.yml b/mkdocs.yml index 720885cb..2db9cf27 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,7 +9,9 @@ repo_name: yahoo/bullet-core pages: - Home: index.md -- Quick Start: quick-start.md +- Quick Start: + - Bullet On Spark: quick-start/bullet-on-spark.md + - Bullet On Storm: quick-start/bullet-on-storm.md - Backend: - Getting your data into Bullet: backend/ingestion.md - Storm: From 63b92191614598dbe506dca1a4f748895f0bf991 Mon Sep 17 00:00:00 2001 From: Nathan Speidel Date: Tue, 19 Jun 2018 13:59:24 -0700 Subject: [PATCH 5/5] Finished spark quickstart without one-liner --- docs/quick-start/bullet-on-spark-with-rest.md | 425 ++++++++++++++++++ docs/quick-start/bullet-on-spark.md | 295 +----------- docs/quick-start/bullet-on-storm-with-rest.md | 417 +++++++++++++++++ docs/quick-start/bullet-on-storm.md | 166 ++++--- examples/Makefile | 3 +- examples/spark/pom.xml | 2 +- ....yaml => bullet_spark_kafka_settings.yaml} | 1 - .../bullet_spark_kafka_settings_new.yaml | 103 +++++ .../resources/bullet_spark_rest_settings.yaml | 36 ++ examples/ui/env-settings.json | 54 ++- 10 files changed, 1140 insertions(+), 362 deletions(-) create mode 100644 docs/quick-start/bullet-on-spark-with-rest.md create mode 100644 docs/quick-start/bullet-on-storm-with-rest.md rename examples/spark/src/main/resources/{bullet_spark_settings.yaml => bullet_spark_kafka_settings.yaml} (99%) create mode 100644 examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml create mode 100644 examples/spark/src/main/resources/bullet_spark_rest_settings.yaml diff --git a/docs/quick-start/bullet-on-spark-with-rest.md b/docs/quick-start/bullet-on-spark-with-rest.md new file mode 100644 index 00000000..e94ac640 --- /dev/null +++ b/docs/quick-start/bullet-on-spark-with-rest.md @@ -0,0 +1,425 @@ +# Quick Start + +This section gets you running a mock instance of Bullet to play around with. The instance will run using Bullet on Spark and use the REST pubsub available as part of bullet-core. 
Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) using [some simple custom Spark code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala). If you want to use Bullet for your data, you will need to read and convert your data to Bullet Records in a similar manner.
+
+At the end of this section, you will have:
+
+ * Launched the Bullet backend on Spark
+ * Setup the [Web Service](ws/setup.md) with its built-in REST PubSub enabled
+ * Setup the [UI](ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/yahoo/bullet-ui/releases/tag/v0.4.0)
+
+**Prerequisites**
+
+ * You will need to be on a Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed
+ * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed
+
+## Install Script
+
+Coming soon - a one-liner to start Bullet.
+
+## Manual Installation
+
+### Setup the Bullet Web Service and REST Pub-Sub
+
+Before we launch the Bullet Spark backend, we first need to set up the Bullet Web Service and PubSub layer. The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package, which is a simple implementation of the PubSub layer using REST endpoints. The Bullet Web Service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service.
+
+#### Step 1: Setup directories and examples
+
+```bash
+export BULLET_HOME=$(pwd)/bullet-quickstart
+mkdir -p $BULLET_HOME/backend/spark
+mkdir -p $BULLET_HOME/service
+mkdir -p $BULLET_HOME/ui
+cd $BULLET_HOME
+curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v0.5.0/examples_artifacts.tar.gz
+tar -xzf examples_artifacts.tar.gz
+export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples
+```
+
+#### Step 2: Install the Bullet Web Service
+
+```bash
+cd $BULLET_HOME/service
+curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar
+cp $BULLET_EXAMPLES/web-service/example_rest_pubsub_config.yaml $BULLET_HOME/service/
+cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/
+```
+
+#### Step 3: Launch the Web Service
+
+```bash
+cd $BULLET_HOME/service
+java -jar bullet-service.jar --bullet.pubsub.config=$BULLET_HOME/service/example_rest_pubsub_config.yaml --bullet.schema.file=$BULLET_HOME/service/example_columns.json --server.port=9999 --bullet.endpoint.http=/query --bullet.pubsub.builtin.rest.enabled=true --logging.path=. --logging.file=log.txt &> log.txt &
+```
+
+The Web Service usually takes ~10-15 seconds to start.
+
+You can check the status of the Web Service by looking at the Web Service log:
+
+```bash
+cat $BULLET_HOME/service/log.txt
+```
+
+The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently).
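+
+If you are scripting these steps, a minimal readiness check is a sketch like the one below. It assumes the `Started Application` log line mentioned above is a reliable signal and that you used the log location from Step 3; it is only a convenience and you can also simply wait ~15 seconds.
+
+```bash
+# Poll the Web Service log until the startup message shows up.
+until grep -q "Started Application" "$BULLET_HOME/service/log.txt" 2>/dev/null; do
+    sleep 2
+done
+echo "Bullet Web Service is up"
+```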
+ +#### Step 4: Test the Web Service (optional) + +We can check that the Web Service is up and running by getting the example columns through the API: + +```bash +curl -s http://localhost:9999/api/bullet/columns +``` + +#### Step 5: Test the PubSub Layer (optional) + +To ensure that the Web Service has been configured to expose the necessary PubSub REST endpoints, we can "write" a fake-query to the PubSub, and then read it back by hand. Since there is currently no backend running, any queries written to the PubSub will simply be stored there until we read it manually. + +Write a fake empty query to the query endpoint: + +```bash +curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query +``` + +Receiving no error response should indicate that the fake query was written to the pubsub. Then read a query from this same endpoint: + +```bash +curl http://localhost:9999/api/bullet/pubsub/query +``` + +This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint. + + +### Setup Bullet Backend on Spark + +We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). + +#### Step 6: Install Spark 2.2.1 + +```bash +export BULLET_SPARK=$BULLET_HOME/backend/spark +cd $BULLET_SPARK +curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz +tar -xzf spark-2.2.1-bin-hadoop2.7.tgz +``` + +#### Step 7: Setup Bullet-Spark and Example Data Producer + +```bash +cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +``` + +#### Step 8: Launch the Bullet Spark Backend + +**Note:** This is a single command (new-lines are escaped) - run it in a single bash command: + +```bash +$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ + --master local[10] \ + --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ + --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ + $BULLET_SPARK/bullet-spark.jar \ + --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt & + +``` + +The backend will usually be up and running within 5-10 seconds. The Web Service will now be hooked up through the REST PubSub to the Spark backend. You can now run a Bullet query by hitting the web service directly: + +```bash +curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/http-query +``` + +This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. + +!!! note "What is this data?" + + This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka. 
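+
+If you want to pull back more than one record at a time, the same endpoint accepts a larger RAW size. A sketch, using the same query format and endpoint as above:
+
+```bash
+# Ask for up to 3 raw records instead of 1.
+curl -s -H 'Content-Type: text/plain' -X POST \
+    -d '{"aggregation": {"size": 3}}' \
+    http://localhost:9999/api/bullet/http-query
+```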
+
+### Setting up the Bullet UI
+
+#### Step 9: Install Node
+
+```bash
+curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash
+source ~/.bashrc
+nvm install v6.9.4
+nvm use v6.9.4
+```
+
+#### Step 10: Install the Bullet UI
+
+```bash
+cd $BULLET_HOME/ui
+curl -LO https://github.com/yahoo/bullet-ui/releases/download/v0.4.0/bullet-ui-v0.4.0.tar.gz
+tar -xzf bullet-ui-v0.4.0.tar.gz
+cp $BULLET_EXAMPLES/ui/env-settings.json config/
+```
+
+#### Step 11: Launch the UI
+
+```bash
+PORT=8800 node express-server.js &
+```
+
+Visit [http://localhost:8800](http://localhost:8800) to query your topology with the UI. See [UI usage](ui/usage.md) for some example queries and interactions using this UI. You can see what the Schema means by visiting the Schema section.
+
+!!! note "Running it remotely?"
+
+    If you access the UI from a machine other than the one where your UI is actually running, you will need to edit ```config/env-settings.json```. Since the UI is a client-side app, the machine that your browser is running on will fetch the UI and attempt to use these settings to talk to the Web Service. Since they point to localhost by default, your browser will attempt to connect there and fail. An easy fix is to change ```localhost``` in your env-settings.json to point to the host name where you will be hosting the UI. This will be the same as the UI host you use in the browser. You can also do a local port forward on the machine accessing the UI by running:
+    ```ssh -N -L 8800:localhost:8800 -L 9999:localhost:9999 hostname-of-the-quickstart-components 2>&1```
+
+## Playing around with the instance
+
+Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI.
+
+## Teardown
+
+If you were using the [Install Script](#install-script) or if you don't want to manually bring down everything, you can run:
+
+```bash
+curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -s cleanup
+```
+
+If you were performing the steps yourself, you can also manually clean up **all the components and all the downloads** using:
+
+| | |
+| -------------- | ---------------------------------------------------------------- |
+| UI | ```pkill -f [e]xpress-server.js``` |
+| Web Service | ```pkill -f [e]xample_rest_pubsub_config.yaml``` |
+| Spark | ```pkill -f [b]ullet-spark``` |
+| File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` |
+
+This does *not* delete ```$HOME/.nvm``` or any extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```.
+
+## What did we do?
+
+This section will go over the various custom pieces this example plugged into Bullet, so you can better understand what we did.
+
+### Storm topology
+
+The topology was the Bullet topology plugged in with a custom spout. This spout is implemented in this [example project](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configurable.
If you examine $BULLET_HOME/backend/storm/launch.sh, you'll see the following: + +```bash +storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ + com.yahoo.bullet.Topology \ + --bullet-conf bullet_settings.yaml \ + --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \ + --bullet-spout-parallelism 1 \ + ... + --bullet-spout-arg 20 \ + --bullet-spout-arg 101 \ + ... +``` + +This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. + +The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps. + +!!! note "I thought you said hundreds of thousands of records..." + + 200 records is not Big Data by any stretch of the imagination but this Quick Start is running everything on one machine and is meant to introduce you to what Bullet does. In practice, you would scale and run your components with CPU and memory configurations to accommodate for your data volume and querying needs. + + +Let's look at the [custom spout code](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that generates the data. + +```java + @Override + public void nextTuple() { + long timeNow = System.nanoTime(); + // Only emit if we are still in the interval and haven't gone over our per period max + if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { + outputCollector.emit(new Values(generateRecord()), DUMMY_ID); + generatedThisPeriod++; + } + if (timeNow > nextIntervalStart) { + log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod); + nextIntervalStart = timeNow + period; + generatedThisPeriod = 0; + periodCount++; + } + // It is courteous to sleep for a short time if you're not emitting anything... + try { + Thread.sleep(1); + } catch (InterruptedException e) { + log.error("Error: ", e); + } + } +``` + +This method above emits the tuples. The Storm framework calls this method. This function only emits at most the given maximum tuples per period. + +!!! note "Why a DUMMY_ID?" + + When the spout emits the randomly generated tuple, it attaches a ```DUMMY_ID``` to it. In Storm terms, this is a message ID. By adding a message ID, this tuple can be made to flow reliably. The Bullet component that receives this tuple (Filter bolt) acknowledges or "acks" this tuple. If the tuple did not make it to Filter bolt within a configured timeout window, Storm will call a ```fail(Object messageId)``` method on the spout. This particular spout does not define one and hence the usage of a ```DUMMY_ID```. If your source of data can identify records uniquely and you can re-emit them on a fail, you should attach that actual ID in place of the ```DUMMY_ID```. 
+ +```java + private BulletRecord generateRecord() { + BulletRecord record = new BulletRecord(); + String uuid = UUID.randomUUID().toString(); + + record.setString(STRING, uuid); + record.setLong(LONG, (long) generatedThisPeriod); + record.setDouble(DOUBLE, random.nextDouble()); + record.setString(TYPE, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setLong(DURATION, System.currentTimeMillis() % INTEGER_POOL[random.nextInt(INTEGER_POOL.length)]); + + Map booleanMap = new HashMap<>(4); + booleanMap.put(uuid.substring(0, 8), random.nextBoolean()); + booleanMap.put(uuid.substring(9, 13), random.nextBoolean()); + booleanMap.put(uuid.substring(14, 18), random.nextBoolean()); + booleanMap.put(uuid.substring(19, 23), random.nextBoolean()); + record.setBooleanMap(BOOLEAN_MAP, booleanMap); + + Map statsMap = new HashMap<>(4); + statsMap.put(PERIOD_COUNT, periodCount); + statsMap.put(RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod); + statsMap.put(NANO_TIME, System.nanoTime()); + statsMap.put(TIMESTAMP, System.currentTimeMillis()); + record.setLongMap(STATS_MAP, statsMap); + + Map randomMapA = new HashMap<>(2); + Map randomMapB = new HashMap<>(2); + randomMapA.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapA.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setListOfStringMap(LIST, asList(randomMapA, randomMapB)); + + return record; + } +``` + +This method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types. + +If you put Bullet on your data, you will need to write a Spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. + +### PubSub + +We used the [DRPC PubSub](pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend. + +```yaml +bullet.pubsub.context.name: "QUERY_PROCESSING" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.function: "bullet-query" +``` + +For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service. + +```yaml +bullet.pubsub.context.name: "QUERY_SUBMISSION" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.servers: + - 127.0.0.1 +bullet.pubsub.storm.drpc.function: "bullet-query" +bullet.pubsub.storm.drpc.http.protocol: "http" +bullet.pubsub.storm.drpc.http.port: "3774" +bullet.pubsub.storm.drpc.http.path: "drpc" +bullet.pubsub.storm.drpc.http.connect.retry.limit: 3 +bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000 +``` + +### Web Service + +We launched the Web Service using two custom files - a PubSub configuration YAML file and JSON schema file. + +The JSON columns file contains the schema for our data specified in JSON. 
Since our schema is not going to change, we use the Web Service to serve it from a file. If your schema changes dynamically, you will need to provide your own endpoint to the UI. + +The following is a snippet from the [JSON file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_columns.json). Notice how the types of the fields are specified. Also, if you have generated BulletRecord with Map fields whose keys are known, you can specify them here using ```enumerations```. + +```javascript +[ + { + "name": "probability", + "type": "DOUBLE", + "description": "Generated from Random#nextDouble" + }, + ... + { + "name": "stats_map", + "type": "MAP", + "subtype": "LONG", + "description": "This map contains some numeric information such as the current number of periods etc.", + "enumerations": [ + ... + {"name": "nano_time", "description": "The ns time when this record was generated"} + ] + }, + { + "name": "classifiers", + "type": "LIST", + "subtype": "MAP", + "description": "This contains two maps, each with: field_A and field_B whose values are randomly chosen from: foo, bar, baz, qux, quux, norf" + } +] +``` +The contents of the [PubSub configuration file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_drpc_pubsub_config.yaml) was discussed in the [PubSub section above](#pubsub). + +### UI + +Finally, we configured the UI with the custom environment specific settings file. We did not add any environments since we only had the one. + +```javascript +{ + "default": { + "queryHost": "http://localhost:9999", + "queryNamespace": "api/bullet", + "queryPath": "query", + "schemaHost": "http://localhost:9999", + "schemaNamespace": "api/bullet", + "helpLinks": [ + { + "name": "Examples", + "link": "https://yahoo.github.io/bullet-docs/ui/usage" + } + ], + "bugLink": "https://github.com/yahoo/bullet-ui/issues", + "modelVersion": 2, + "migrations": { + "deletions": "result" + }, + "defaultValues": { + "defaultValues": { + "aggregationMaxSize": 1024, + "rawMaxSize": 500, + "durationMaxSecs": 540, + "distributionNumberOfPoints": 11, + "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", + "distributionQuantileStart": 0, + "distributionQuantileEnd": 1, + "distributionQuantileIncrement": 0.1, + "queryTimeoutSecs": 3, + "sketches": { + "countDistinctMaxEntries": 16384, + "groupByMaxEntries": 512, + "distributionMaxEntries": 1024, + "distributionMaxNumberOfPoints": 200, + "topKMaxEntries": 1024, + "topKErrorType": "No False Negatives" + }, + "metadataKeyMapping": { + "theta": "theta", + "uniquesEstimate": "uniques_estimate", + "queryCreationTime": "query_receive_time", + "queryTerminationTime": "query_finish_time", + "estimatedResult": "was_estimated", + "standardDeviations": "standard_deviations", + "normalizedRankError": "normalized_rank_error", + "maximumCountError": "maximum_count_error", + "itemsSeen": "items_seen", + "minimumValue": "minimum_value", + "maximumValue": "maximum_value" + } + } + } +} +``` + +Since we served our schema through the same Web Service as our queries, both these point to our Web Service. Note that there is no ```schemaPath``` because it must be the constant string ```columns```. If you define a custom endpoint for your schema, you must ensure that it can be obtained by making a GET request to ```schemaHost/schemaNamespace/columns```. 
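+
+As a quick sanity check of that convention, the schema URL composed from the settings above is just the columns endpoint we already tested earlier (assuming the example settings are unchanged):
+
+```bash
+# schemaHost + "/" + schemaNamespace + "/columns" from env-settings.json
+curl -s http://localhost:9999/api/bullet/columns
+```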
diff --git a/docs/quick-start/bullet-on-spark.md b/docs/quick-start/bullet-on-spark.md index f3a7364f..fb647935 100644 --- a/docs/quick-start/bullet-on-spark.md +++ b/docs/quick-start/bullet-on-spark.md @@ -13,11 +13,7 @@ At the end of this section, you will have: * You will need to be on an Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed -## Install Script - -DO THIS LATER (one-liner?) Remove this 88 - -## Manual Installation +## To Install and Launch Bullet Locally: ### Setup Kafka @@ -32,7 +28,7 @@ mkdir -p $BULLET_HOME/pubsub mkdir -p $BULLET_HOME/service mkdir -p $BULLET_HOME/ui cd $BULLET_HOME -Remove this 88: DO THE THING to download the compressed folder (needs to be put on git) - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz - now: cp ~/bullet/bullet-db.github.io/examples/examples_artifacts.tar.gz . +curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v0.5.0/examples_artifacts.tar.gz tar -xzf examples_artifacts.tar.gz export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples ``` @@ -121,7 +117,6 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -# Remove this 88: THIS (next line) does not yet exist - make sure it's available curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar ``` @@ -135,7 +130,7 @@ $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ --driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \ $BULLET_SPARK/bullet-spark.jar \ - --bullet-spark-conf=$BULLET_SPARK/bullet_spark_settings.yaml &> log.txt & + --bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt & ``` @@ -154,8 +149,6 @@ This query will return a result JSON containing a "records" field containing a s ### Setting up the Bullet UI -** ------------------------------------------------------------ This needs to be re-done with the new UI!! ------------------------------------------------------------------------- ** - #### Step 12: Install Node ```bash @@ -169,8 +162,8 @@ nvm use v6.9.4 #### Step 13: Install the Bullet UI ```bash -curl -LO https://github.com/yahoo/bullet-ui/releases/download/v0.4.0/bullet-ui-v0.4.0.tar.gz -tar -xzf bullet-ui-v0.4.0.tar.gz +curl -LO https://github.com/bullet-db/bullet-ui/releases/download/v0.5.0/bullet-ui-v0.5.0.tar.gz +tar -xzf bullet-ui-v0.5.0.tar.gz cp $BULLET_EXAMPLES/ui/env-settings.json config/ ``` @@ -187,55 +180,15 @@ Visit [http://localhost:8800](http://localhost:8800) to query your topology with If you access the UI from another machine than where your UI is actually running, you will need to edit ```config/env-settings.json```. Since the UI is a client-side app, the machine that your browser is running on will fetch the UI and attempt to use these settings to talk to the Web Service. Since they point to localhost by default, your browser will attempt to connect there and fail. An easy fix is to change ```localhost``` in your env-settings.json to point to the host name where you will hosting the UI. This will be the same as the UI host you use in the browser. 
You can also do a local port forward on the machine accessing the UI by running: ```ssh -N -L 8800:localhost:8800 -L 9999:localhost:9999 hostname-of-the-quickstart-components 2>&1``` -## Playing around with the instance - -Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI. - - - - - - - - - - - - - - - - -## Teardown -* Web service: - * pkill -f [e]xample_kafka_pubsub_config.yaml - * pkill -f [b]ullet-spark -* Kafka and Zookeeper: - * TRY THIS: - * pkill -f kafka-server-start - * pkill -f [p]ubsub/kafka_2.12-0.11.0.1 - * /Users/nspeidel/bullet-quickstart/pubsub/kafka_2.12-0.11.0.1/bin/zookeeper-server-stop.sh - * pkill -f [k]afka_2.12-0.11.0.1 - * pkill -f [k]afka-server-start - * pkill -f [z]ookeeper.properties - - - - - - +## Congratulations!! Bullet is all setup! +#### Playing around with the instance: +Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI. ## Teardown -If you were using the [Install Script](#install-script) or if you don't want to manually bring down everything, you can run: - -```bash -curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -s cleanup -``` - -If you were performing the steps yourself, you can also manually cleanup **all the components and all the downloads** using: +When you are done trying out Bullet, you can stop the processes and cleanup all the downloads using: | | | | -------------- | ---------------------------------------------------------------- | @@ -244,233 +197,7 @@ If you were performing the steps yourself, you can also manually cleanup **all t | Spark | ```pkill -f [b]ullet-spark``` | | Kafka | ```${KAFKA_DIR}/bin/kafka-server-stop.sh``` | | Zookeeper | ```${KAFKA_DIR}/bin/zookeeper-server-stop.sh``` | -| File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` | - - - -This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```. - -## What did we do? - -This section will go over the various custom pieces this example plugged into Bullet, so you can better understand what we did. - -### Storm topology - -The topology was the Bullet topology plugged in with a custom spout. This spout is implemented in this [example project](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configurable. If you examine $BULLET_HOME/backend/storm/launch.sh, you'll see the following: - -```bash -storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ - com.yahoo.bullet.Topology \ - --bullet-conf bullet_settings.yaml \ - --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \ - --bullet-spout-parallelism 1 \ - ... - --bullet-spout-arg 20 \ - --bullet-spout-arg 101 \ - ... -``` +| File System | ```rm -rf $BULLET_HOME``` | -This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. 
We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. - -The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps. - -!!! note "I thought you said hundreds of thousands of records..." - - 200 records is not Big Data by any stretch of the imagination but this Quick Start is running everything on one machine and is meant to introduce you to what Bullet does. In practice, you would scale and run your components with CPU and memory configurations to accommodate for your data volume and querying needs. - - -Let's look at the [custom spout code](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that generates the data. - -```java - @Override - public void nextTuple() { - long timeNow = System.nanoTime(); - // Only emit if we are still in the interval and haven't gone over our per period max - if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { - outputCollector.emit(new Values(generateRecord()), DUMMY_ID); - generatedThisPeriod++; - } - if (timeNow > nextIntervalStart) { - log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod); - nextIntervalStart = timeNow + period; - generatedThisPeriod = 0; - periodCount++; - } - // It is courteous to sleep for a short time if you're not emitting anything... - try { - Thread.sleep(1); - } catch (InterruptedException e) { - log.error("Error: ", e); - } - } -``` - -This method above emits the tuples. The Storm framework calls this method. This function only emits at most the given maximum tuples per period. - -!!! note "Why a DUMMY_ID?" - - When the spout emits the randomly generated tuple, it attaches a ```DUMMY_ID``` to it. In Storm terms, this is a message ID. By adding a message ID, this tuple can be made to flow reliably. The Bullet component that receives this tuple (Filter bolt) acknowledges or "acks" this tuple. If the tuple did not make it to Filter bolt within a configured timeout window, Storm will call a ```fail(Object messageId)``` method on the spout. This particular spout does not define one and hence the usage of a ```DUMMY_ID```. If your source of data can identify records uniquely and you can re-emit them on a fail, you should attach that actual ID in place of the ```DUMMY_ID```. 
- -```java - private BulletRecord generateRecord() { - BulletRecord record = new BulletRecord(); - String uuid = UUID.randomUUID().toString(); - - record.setString(STRING, uuid); - record.setLong(LONG, (long) generatedThisPeriod); - record.setDouble(DOUBLE, random.nextDouble()); - record.setString(TYPE, STRING_POOL[random.nextInt(STRING_POOL.length)]); - record.setLong(DURATION, System.currentTimeMillis() % INTEGER_POOL[random.nextInt(INTEGER_POOL.length)]); - - Map booleanMap = new HashMap<>(4); - booleanMap.put(uuid.substring(0, 8), random.nextBoolean()); - booleanMap.put(uuid.substring(9, 13), random.nextBoolean()); - booleanMap.put(uuid.substring(14, 18), random.nextBoolean()); - booleanMap.put(uuid.substring(19, 23), random.nextBoolean()); - record.setBooleanMap(BOOLEAN_MAP, booleanMap); - - Map statsMap = new HashMap<>(4); - statsMap.put(PERIOD_COUNT, periodCount); - statsMap.put(RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod); - statsMap.put(NANO_TIME, System.nanoTime()); - statsMap.put(TIMESTAMP, System.currentTimeMillis()); - record.setLongMap(STATS_MAP, statsMap); - - Map randomMapA = new HashMap<>(2); - Map randomMapB = new HashMap<>(2); - randomMapA.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); - randomMapA.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); - randomMapB.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); - randomMapB.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); - record.setListOfStringMap(LIST, asList(randomMapA, randomMapB)); - - return record; - } -``` - -This method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types. - -If you put Bullet on your data, you will need to write a Spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. - -### PubSub - -We used the [DRPC PubSub](pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend. - -```yaml -bullet.pubsub.context.name: "QUERY_PROCESSING" -bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" -bullet.pubsub.storm.drpc.function: "bullet-query" -``` - -For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service. - -```yaml -bullet.pubsub.context.name: "QUERY_SUBMISSION" -bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" -bullet.pubsub.storm.drpc.servers: - - 127.0.0.1 -bullet.pubsub.storm.drpc.function: "bullet-query" -bullet.pubsub.storm.drpc.http.protocol: "http" -bullet.pubsub.storm.drpc.http.port: "3774" -bullet.pubsub.storm.drpc.http.path: "drpc" -bullet.pubsub.storm.drpc.http.connect.retry.limit: 3 -bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000 -``` - -### Web Service - -We launched the Web Service using two custom files - a PubSub configuration YAML file and JSON schema file. - -The JSON columns file contains the schema for our data specified in JSON. 
Since our schema is not going to change, we use the Web Service to serve it from a file. If your schema changes dynamically, you will need to provide your own endpoint to the UI. - -The following is a snippet from the [JSON file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_columns.json). Notice how the types of the fields are specified. Also, if you have generated BulletRecord with Map fields whose keys are known, you can specify them here using ```enumerations```. - -```javascript -[ - { - "name": "probability", - "type": "DOUBLE", - "description": "Generated from Random#nextDouble" - }, - ... - { - "name": "stats_map", - "type": "MAP", - "subtype": "LONG", - "description": "This map contains some numeric information such as the current number of periods etc.", - "enumerations": [ - ... - {"name": "nano_time", "description": "The ns time when this record was generated"} - ] - }, - { - "name": "classifiers", - "type": "LIST", - "subtype": "MAP", - "description": "This contains two maps, each with: field_A and field_B whose values are randomly chosen from: foo, bar, baz, qux, quux, norf" - } -] -``` -The contents of the [PubSub configuration file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_drpc_pubsub_config.yaml) was discussed in the [PubSub section above](#pubsub). - -### UI - -Finally, we configured the UI with the custom environment specific settings file. We did not add any environments since we only had the one. - -```javascript -{ - "default": { - "queryHost": "http://localhost:9999", - "queryNamespace": "api/bullet", - "queryPath": "query", - "schemaHost": "http://localhost:9999", - "schemaNamespace": "api/bullet", - "helpLinks": [ - { - "name": "Examples", - "link": "https://yahoo.github.io/bullet-docs/ui/usage" - } - ], - "bugLink": "https://github.com/yahoo/bullet-ui/issues", - "modelVersion": 2, - "migrations": { - "deletions": "result" - }, - "defaultValues": { - "defaultValues": { - "aggregationMaxSize": 1024, - "rawMaxSize": 500, - "durationMaxSecs": 540, - "distributionNumberOfPoints": 11, - "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", - "distributionQuantileStart": 0, - "distributionQuantileEnd": 1, - "distributionQuantileIncrement": 0.1, - "queryTimeoutSecs": 3, - "sketches": { - "countDistinctMaxEntries": 16384, - "groupByMaxEntries": 512, - "distributionMaxEntries": 1024, - "distributionMaxNumberOfPoints": 200, - "topKMaxEntries": 1024, - "topKErrorType": "No False Negatives" - }, - "metadataKeyMapping": { - "theta": "theta", - "uniquesEstimate": "uniques_estimate", - "queryCreationTime": "query_receive_time", - "queryTerminationTime": "query_finish_time", - "estimatedResult": "was_estimated", - "standardDeviations": "standard_deviations", - "normalizedRankError": "normalized_rank_error", - "maximumCountError": "maximum_count_error", - "itemsSeen": "items_seen", - "minimumValue": "minimum_value", - "maximumValue": "maximum_value" - } - } - } -} -``` +Note: This does *not* delete ```$HOME/.nvm```. -Since we served our schema through the same Web Service as our queries, both these point to our Web Service. Note that there is no ```schemaPath``` because it must be the constant string ```columns```. If you define a custom endpoint for your schema, you must ensure that it can be obtained by making a GET request to ```schemaHost/schemaNamespace/columns```. 
diff --git a/docs/quick-start/bullet-on-storm-with-rest.md b/docs/quick-start/bullet-on-storm-with-rest.md new file mode 100644 index 00000000..eed84fb4 --- /dev/null +++ b/docs/quick-start/bullet-on-storm-with-rest.md @@ -0,0 +1,417 @@ +# Quick Start + +This section gets you running a mock instance of Bullet to play around with. The instance will run using Bullet on Spark and use the REST pubsub available as part of bullet-core. Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) using [some simple custom Spark code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala). If you want to use Bullet for your data, you will need to do read and convert your data to Bullet Records in a similar manner. + +At the end of this section, you will have: + + * Launched the Bullet backend on spark + * Setup the [Web Service](ws/setup.md) with it's built-in REST pubsub enabled + * Setup the [UI](ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/yahoo/bullet-ui/releases/tag/v0.4.0) + +**Prerequisites** + + * You will need to be on an Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed + * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed + +## Install Script + +DO THIS LATER (one-liner?) + +## Manual Installation + +### Setup the Bullet Web Service and REST Pub-Sub + +Before we launch the Bullet Spark backend, we first need to setup the Bullet Web Service and PubSub layer. The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package which is a simple implementation of the PubSub layer using REST endpoints. The bullet web service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service. + +#### Step 1: Setup directories and examples + +```bash +export BULLET_HOME=$(pwd)/bullet-quickstart +mkdir -p $BULLET_HOME/backend/spark +mkdir -p $BULLET_HOME/service +mkdir -p $BULLET_HOME/ui +cd $BULLET_HOME +DO THE THING to download the compressed folder - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz +tar -xzf examples_artifacts.tar.gz +export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples +``` + +#### Step 2: Install the Bullet Web Service + +```bash +cd $BULLET_HOME/service +curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar +cp $BULLET_EXAMPLES/web-service/example_rest_pubsub_config.yaml $BULLET_HOME/service/ +cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/ +``` + +#### Step 3: Launch the Web Service + +```bash +cd $BULLET_HOME/service +java -jar bullet-service.jar --bullet.pubsub.config=$BULLET_HOME/service/example_rest_pubsub_config.yaml --bullet.schema.file=$BULLET_HOME/service/example_columns.json --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path=. --logging.file=log.txt &> log.txt & +``` + +The Web Service usually takes ~10-15 seconds to start. 
+ +You can check the status of the Web Service by looking at the Web Service log: + +```bash +cat $BULLET_HOME/service/log.txt +``` + +The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently). + +#### Step 4: Test the Web Service (optional) + +We can check that the Web Service is up and running by getting the example columns through the API: + +```bash +curl -s http://localhost:9999/api/bullet/columns +``` + +#### Step 5: Test the PubSub Layer (optional) + +To ensure that the Web Service has been configured to expose the necessary PubSub REST endpoints, we can "write" a fake-query to the PubSub, and then read it back by hand. Since there is currently no backend running, any queries written to the PubSub will simply be stored there until we read it manually. + +Write a fake empty query to the query endpoint: + +```bash +curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query +``` + +Receiving no error response should indicate that the fake query was written to the pubsub. Then read a query from this same endpoint: + +```bash +curl http://localhost:9999/api/bullet/pubsub/query +``` + +This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint. + + +### Setup Bullet Backend on Spark + +We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). + +#### Step 6: Install Spark 2.2.1 + +```bash +export BULLET_SPARK=$BULLET_HOME/backend/spark +cd $BULLET_SPARK +curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz +tar -xzf spark-2.2.1-bin-hadoop2.7.tgz +``` + +#### Step 7: Setup Bullet-Spark and Example Data Producer + +```bash +cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar +``` + +#### Step 8: Launch the Bullet Spark Backend + +```bash +$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit --master local[10] --class com.yahoo.bullet.spark.BulletSparkStreamingMain --driver-class-path $BULLET_SPARK/bullet-spark-example.jar:$BULLET_SPARK/bullet-spark.jar $BULLET_SPARK/bullet-spark.jar &> log.txt & +``` + +The backend will usually be up and running within 5-10 seconds. The Web Service will now be hooked up through the REST PubSub to the Spark backend. You can now run a Bullet query by hitting the web service directly: + +```bash +curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query +``` + +This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. + +!!! note "What is this data?" + + This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka. 
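+
+Since this example posts to the ```sse-query``` endpoint, which (as its name suggests) is intended to stream results back as Server-Sent Events, you may want to disable curl's output buffering when you experiment with it. A sketch, assuming the endpoint behaves as its name implies:
+
+```bash
+# -N turns off curl's buffering so streamed chunks are printed as they arrive.
+curl -sN -H 'Content-Type: text/plain' -X POST \
+    -d '{"aggregation": {"size": 1}}' \
+    http://localhost:9999/api/bullet/sse-query
+```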
+ + + + + + + + + +### Setting up the Bullet UI + +#### Step 9: Install Node + +```bash +curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash +source ~/.bashrc +nvm install v6.9.4 +nvm use v6.9.4 +``` + +#### Step 10: Install the Bullet UI + +```bash +cd $BULLET_HOME/ui +curl -LO https://github.com/yahoo/bullet-ui/releases/download/v0.4.0/bullet-ui-v0.4.0.tar.gz +tar -xzf bullet-ui-v0.4.0.tar.gz +cp $BULLET_EXAMPLES/ui/env-settings.json config/ +``` + +#### Step 11: Launch the UI + +```bash +PORT=8800 node express-server.js & +``` + +Visit [http://localhost:8800](http://localhost:8800) to query your topology with the UI. See [UI usage](ui/usage.md) for some example queries and interactions using this UI. You see what the Schema means by visiting the Schema section. + +!!! note "Running it remotely?" + + If you access the UI from another machine than where your UI is actually running, you will need to edit ```config/env-settings.json```. Since the UI is a client-side app, the machine that your browser is running on will fetch the UI and attempt to use these settings to talk to the Web Service. Since they point to localhost by default, your browser will attempt to connect there and fail. An easy fix is to change ```localhost``` in your env-settings.json to point to the host name where you will hosting the UI. This will be the same as the UI host you use in the browser. You can also do a local port forward on the machine accessing the UI by running: + ```ssh -N -L 8800:localhost:8800 -L 9999:localhost:9999 hostname-of-the-quickstart-components 2>&1``` + +## Playing around with the instance + +Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI. + +## Teardown + +If you were using the [Install Script](#install-script) or if you don't want to manually bring down everything, you can run: + +```bash +curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -s cleanup +``` + +If you were performing the steps yourself, you can also manually cleanup **all the components and all the downloads** using: + +| | | +| -------------- | ---------------------------------------------------------------- | +| UI | ```pkill -f [e]xpress-server.js``` | +| Web Service | ```pkill -f [e]xample_drpc_pubsub_config.yaml``` | +| Storm | ```pkill -f [a]pache-storm-1.1.2``` | +| File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` | + +This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```. + +## What did we do? + +This section will go over the various custom pieces this example plugged into Bullet, so you can better understand what we did. + +### Storm topology + +The topology was the Bullet topology plugged in with a custom spout. This spout is implemented in this [example project](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configurable. 
If you examine $BULLET_HOME/backend/storm/launch.sh, you'll see the following: + +```bash +storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ + com.yahoo.bullet.Topology \ + --bullet-conf bullet_settings.yaml \ + --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \ + --bullet-spout-parallelism 1 \ + ... + --bullet-spout-arg 20 \ + --bullet-spout-arg 101 \ + ... +``` + +This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. + +The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps. + +!!! note "I thought you said hundreds of thousands of records..." + + 200 records is not Big Data by any stretch of the imagination but this Quick Start is running everything on one machine and is meant to introduce you to what Bullet does. In practice, you would scale and run your components with CPU and memory configurations to accommodate for your data volume and querying needs. + + +Let's look at the [custom spout code](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that generates the data. + +```java + @Override + public void nextTuple() { + long timeNow = System.nanoTime(); + // Only emit if we are still in the interval and haven't gone over our per period max + if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { + outputCollector.emit(new Values(generateRecord()), DUMMY_ID); + generatedThisPeriod++; + } + if (timeNow > nextIntervalStart) { + log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod); + nextIntervalStart = timeNow + period; + generatedThisPeriod = 0; + periodCount++; + } + // It is courteous to sleep for a short time if you're not emitting anything... + try { + Thread.sleep(1); + } catch (InterruptedException e) { + log.error("Error: ", e); + } + } +``` + +This method above emits the tuples. The Storm framework calls this method. This function only emits at most the given maximum tuples per period. + +!!! note "Why a DUMMY_ID?" + + When the spout emits the randomly generated tuple, it attaches a ```DUMMY_ID``` to it. In Storm terms, this is a message ID. By adding a message ID, this tuple can be made to flow reliably. The Bullet component that receives this tuple (Filter bolt) acknowledges or "acks" this tuple. If the tuple did not make it to Filter bolt within a configured timeout window, Storm will call a ```fail(Object messageId)``` method on the spout. This particular spout does not define one and hence the usage of a ```DUMMY_ID```. If your source of data can identify records uniquely and you can re-emit them on a fail, you should attach that actual ID in place of the ```DUMMY_ID```. 
+ +```java + private BulletRecord generateRecord() { + BulletRecord record = new BulletRecord(); + String uuid = UUID.randomUUID().toString(); + + record.setString(STRING, uuid); + record.setLong(LONG, (long) generatedThisPeriod); + record.setDouble(DOUBLE, random.nextDouble()); + record.setString(TYPE, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setLong(DURATION, System.currentTimeMillis() % INTEGER_POOL[random.nextInt(INTEGER_POOL.length)]); + + Map booleanMap = new HashMap<>(4); + booleanMap.put(uuid.substring(0, 8), random.nextBoolean()); + booleanMap.put(uuid.substring(9, 13), random.nextBoolean()); + booleanMap.put(uuid.substring(14, 18), random.nextBoolean()); + booleanMap.put(uuid.substring(19, 23), random.nextBoolean()); + record.setBooleanMap(BOOLEAN_MAP, booleanMap); + + Map statsMap = new HashMap<>(4); + statsMap.put(PERIOD_COUNT, periodCount); + statsMap.put(RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod); + statsMap.put(NANO_TIME, System.nanoTime()); + statsMap.put(TIMESTAMP, System.currentTimeMillis()); + record.setLongMap(STATS_MAP, statsMap); + + Map randomMapA = new HashMap<>(2); + Map randomMapB = new HashMap<>(2); + randomMapA.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapA.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]); + randomMapB.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]); + record.setListOfStringMap(LIST, asList(randomMapA, randomMapB)); + + return record; + } +``` + +This method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types. + +If you put Bullet on your data, you will need to write a Spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. + +### PubSub + +We used the [DRPC PubSub](pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend. + +```yaml +bullet.pubsub.context.name: "QUERY_PROCESSING" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.function: "bullet-query" +``` + +For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service. + +```yaml +bullet.pubsub.context.name: "QUERY_SUBMISSION" +bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" +bullet.pubsub.storm.drpc.servers: + - 127.0.0.1 +bullet.pubsub.storm.drpc.function: "bullet-query" +bullet.pubsub.storm.drpc.http.protocol: "http" +bullet.pubsub.storm.drpc.http.port: "3774" +bullet.pubsub.storm.drpc.http.path: "drpc" +bullet.pubsub.storm.drpc.http.connect.retry.limit: 3 +bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000 +``` + +### Web Service + +We launched the Web Service using two custom files - a PubSub configuration YAML file and JSON schema file. + +The JSON columns file contains the schema for our data specified in JSON. 
Since our schema is not going to change, we use the Web Service to serve it from a file. If your schema changes dynamically, you will need to provide your own endpoint to the UI.
+
+The following is a snippet from the [JSON file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_columns.json). Notice how the types of the fields are specified. Also, if your generated BulletRecords have Map fields whose keys are known, you can specify them here using ```enumerations```.
+
+```javascript
+[
+    {
+        "name": "probability",
+        "type": "DOUBLE",
+        "description": "Generated from Random#nextDouble"
+    },
+    ...
+    {
+        "name": "stats_map",
+        "type": "MAP",
+        "subtype": "LONG",
+        "description": "This map contains some numeric information such as the current number of periods etc.",
+        "enumerations": [
+            ...
+            {"name": "nano_time", "description": "The ns time when this record was generated"}
+        ]
+    },
+    {
+        "name": "classifiers",
+        "type": "LIST",
+        "subtype": "MAP",
+        "description": "This contains two maps, each with: field_A and field_B whose values are randomly chosen from: foo, bar, baz, qux, quux, norf"
+    }
+]
+```
+
+The contents of the [PubSub configuration file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_drpc_pubsub_config.yaml) were discussed in the [PubSub section above](#pubsub).
+
+### UI
+
+Finally, we configured the UI with the custom environment-specific settings file. We did not add any other environments since we only had the one.
+
+```javascript
+{
+    "default": {
+        "queryHost": "http://localhost:9999",
+        "queryNamespace": "api/bullet",
+        "queryPath": "query",
+        "schemaHost": "http://localhost:9999",
+        "schemaNamespace": "api/bullet",
+        "helpLinks": [
+            {
+                "name": "Examples",
+                "link": "https://yahoo.github.io/bullet-docs/ui/usage"
+            }
+        ],
+        "bugLink": "https://github.com/yahoo/bullet-ui/issues",
+        "modelVersion": 2,
+        "migrations": {
+            "deletions": "result"
+        },
+        "defaultValues": {
+            "aggregationMaxSize": 1024,
+            "rawMaxSize": 500,
+            "durationMaxSecs": 540,
+            "distributionNumberOfPoints": 11,
+            "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1",
+            "distributionQuantileStart": 0,
+            "distributionQuantileEnd": 1,
+            "distributionQuantileIncrement": 0.1,
+            "queryTimeoutSecs": 3,
+            "sketches": {
+                "countDistinctMaxEntries": 16384,
+                "groupByMaxEntries": 512,
+                "distributionMaxEntries": 1024,
+                "distributionMaxNumberOfPoints": 200,
+                "topKMaxEntries": 1024,
+                "topKErrorType": "No False Negatives"
+            },
+            "metadataKeyMapping": {
+                "theta": "theta",
+                "uniquesEstimate": "uniques_estimate",
+                "queryCreationTime": "query_receive_time",
+                "queryTerminationTime": "query_finish_time",
+                "estimatedResult": "was_estimated",
+                "standardDeviations": "standard_deviations",
+                "normalizedRankError": "normalized_rank_error",
+                "maximumCountError": "maximum_count_error",
+                "itemsSeen": "items_seen",
+                "minimumValue": "minimum_value",
+                "maximumValue": "maximum_value"
+            }
+        }
+    }
+}
+```
+
+Since we served our schema through the same Web Service as our queries, both of these point to our Web Service. Note that there is no ```schemaPath``` because it must be the constant string ```columns```. If you define a custom endpoint for your schema, you must ensure that it can be obtained by making a GET request to ```schemaHost/schemaNamespace/columns```.
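+
+As a sanity check, the schema endpoint the UI reads from can also be hit by hand once the Web Service is up. This is just the columns request from the Web Service steps earlier, spelled out against the example ```schemaHost``` and ```schemaNamespace``` values used in this guide (adjust the host and port if yours differ):
+
+```bash
+# Fetch the schema the UI would load. The host, port and namespace assume the
+# example Web Service settings from this quick start.
+curl -s http://localhost:9999/api/bullet/columns
+```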
diff --git a/docs/quick-start/bullet-on-storm.md b/docs/quick-start/bullet-on-storm.md
index eed84fb4..4c8c1217 100644
--- a/docs/quick-start/bullet-on-storm.md
+++ b/docs/quick-start/bullet-on-storm.md
@@ -1,139 +1,197 @@
-# Quick Start
+# Quick Start - Bullet on Storm
 
-This section gets you running a mock instance of Bullet to play around with. The instance will run using Bullet on Spark and use the REST pubsub available as part of bullet-core. Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) using [some simple custom Spark code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala). If you want to use Bullet for your data, you will need to do read and convert your data to Bullet Records in a similar manner.
+!!! note "NOTE: This is an old version of Bullet"
+    The version of Bullet this quickstart uses does not support the newest functionality such as Windowing. We are working hard to get new documentation up as soon as possible. Use [the Spark quickstart](bullet-on-spark.md) to see all the latest features. An updated quickstart for Storm is coming soon.
+
+This section gets you running a mock instance of Bullet to play around with. The instance will run using [Bullet on Storm](backend/storm-setup.md) and use the [DRPC Pubsub](pubsub/storm-drpc.md). Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) in a [custom Storm spout](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java). If you want to use Bullet for your data, you will need to read and convert your data to Bullet Records in a similar manner.
 
 At the end of this section, you will have:
 
- * Launched the Bullet backend on spark
- * Setup the [Web Service](ws/setup.md) with it's built-in REST pubsub enabled
+ * Setup the Bullet topology using a custom spout on [bullet-storm-0.6.2](https://github.com/yahoo/bullet-storm/releases/tag/bullet-storm-0.6.2)
+ * Setup the [Web Service](ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-0.1.1](https://github.com/yahoo/bullet-service/releases/tag/bullet-service-0.1.1)
+ * Setup the [DRPC PubSub](pubsub/storm-drpc.md) talking to the topology and Web Service.
 * Setup the [UI](ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/yahoo/bullet-ui/releases/tag/v0.4.0)
 
 **Prerequisites**
 
 * You will need to be on a Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed
 * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed
+ * You will need enough CPU and RAM on your machine to run about 8-10 JVMs in ```server``` mode. You should have at least 2 GB free space on your disk. We will be setting up a Storm cluster with multiple components, a couple of Jetty instances and a Node server.
 
 ## Install Script
 
-DO THIS LATER (one-liner?)
+Simply run:
+
+```bash
+curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash
+```
+
+This will set up a local Storm cluster, Bullet running on it, the Bullet Web Service and a Bullet UI for you. Once everything has launched, you should be able to go to the Bullet UI running locally at [http://localhost:8800](http://localhost:8800).
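+
+If you would rather see what the script does before running it, you can download the same file first and run it after inspecting it; this is just a more cautious version of the one-liner above:
+
+```bash
+# Download the install script (same URL as above), look it over, then run it.
+curl -sLo install-all.sh https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh
+less install-all.sh
+bash install-all.sh
+```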
You can then [**continue this guide from here**](#what-did-we-do). + +!!! note "Want to DIY?" + If you want to manually run all the commands or if the script died while doing something above (might want to perform the [teardown](#teardown) first), you can continue below. + ## Manual Installation -### Setup the Bullet Web Service and REST Pub-Sub +### Setting up Storm -Before we launch the Bullet Spark backend, we first need to setup the Bullet Web Service and PubSub layer. The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package which is a simple implementation of the PubSub layer using REST endpoints. The bullet web service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service. +To set up a clean working environment, let's start with creating some directories. #### Step 1: Setup directories and examples ```bash export BULLET_HOME=$(pwd)/bullet-quickstart -mkdir -p $BULLET_HOME/backend/spark +mkdir -p $BULLET_HOME/backend/storm mkdir -p $BULLET_HOME/service mkdir -p $BULLET_HOME/ui cd $BULLET_HOME -DO THE THING to download the compressed folder - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz +curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz tar -xzf examples_artifacts.tar.gz export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples ``` -#### Step 2: Install the Bullet Web Service +#### Step 2: Install Storm 1.1 ```bash -cd $BULLET_HOME/service -curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar -cp $BULLET_EXAMPLES/web-service/example_rest_pubsub_config.yaml $BULLET_HOME/service/ -cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/ +cd $BULLET_HOME/backend +curl -O http://apache.org/dist/storm/apache-storm-1.1.2/apache-storm-1.1.2.zip +unzip apache-storm-1.1.2.zip +export PATH=$(pwd)/apache-storm-1.1.2/bin/:$PATH ``` - -#### Step 3: Launch the Web Service +Add a DRPC server setting to the Storm config: ```bash -cd $BULLET_HOME/service -java -jar bullet-service.jar --bullet.pubsub.config=$BULLET_HOME/service/example_rest_pubsub_config.yaml --bullet.schema.file=$BULLET_HOME/service/example_columns.json --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path=. --logging.file=log.txt &> log.txt & +echo 'drpc.servers: ["127.0.0.1"]' >> apache-storm-1.1.2/conf/storm.yaml ``` -The Web Service usually takes ~10-15 seconds to start. +#### Step 3: Launch Storm components -You can check the status of the Web Service by looking at the Web Service log: +Launch each of the following components, in order and wait for the commands to go through. You may have to do these one at a time. You will see a JVM being launched for each one and connection messages as the components communicate through Zookeeper. ```bash -cat $BULLET_HOME/service/log.txt +storm dev-zookeeper & +storm nimbus & +storm drpc & +storm ui & +storm logviewer & +storm supervisor & ``` -The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently). +It may take 30-60 seconds for all the components to launch. 
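+
+One rough way to confirm that all of these came up (a sketch, not an exact check, since the reported names can vary by Storm version) is to list the running JVMs with the JDK's ```jps``` tool and look for one process per component launched above:
+
+```bash
+# List local JVMs with their main classes; expect to see entries for Nimbus,
+# the supervisor, DRPC, the Storm UI, the logviewer and the dev Zookeeper.
+jps -l
+```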
+ +Once everything is up without errors, visit [http://localhost:8080](http://localhost:8080) and see if the Storm UI loads. -#### Step 4: Test the Web Service (optional) +#### Step 4: Test Storm (Optional) -We can check that the Web Service is up and running by getting the example columns through the API: +Before Bullet, test to see if Storm and DRPC are up and running by launching a example topology that comes with your Storm installation: ```bash -curl -s http://localhost:9999/api/bullet/columns +storm jar apache-storm-1.1.2/examples/storm-starter/storm-starter-topologies-1.1.2.jar org.apache.storm.starter.BasicDRPCTopology topology ``` -#### Step 5: Test the PubSub Layer (optional) - -To ensure that the Web Service has been configured to expose the necessary PubSub REST endpoints, we can "write" a fake-query to the PubSub, and then read it back by hand. Since there is currently no backend running, any queries written to the PubSub will simply be stored there until we read it manually. - -Write a fake empty query to the query endpoint: +Visit your UI with a browser and see if a topology with name "topology" is running. If everything is good, you should be able to ping DRPC with: ```bash -curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query +curl localhost:3774/drpc/exclamation/foo ``` -Receiving no error response should indicate that the fake query was written to the pubsub. Then read a query from this same endpoint: +and get back a ```foo!```. Any string you pass as part of the URL is returned to you with a "!" at the end. + +Kill this topology after with: ```bash -curl http://localhost:9999/api/bullet/pubsub/query +storm kill topology ``` -This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint. +!!! note "Local mode cleanup" + If you notice any problems while setting up storm or while relaunching a topology, it may be because some state is corrupted. When running Storm in this fashion, states and serializations are stored in ```storm-local``` and ```/tmp/```. You may want to ```rm -rf storm-local/* /tmp/dev-storm-zookeeper``` to clean up this state before relaunching Storm components. See the [tear down section](#teardown) on how to kill any running instances. -### Setup Bullet Backend on Spark +### Setting up the example Bullet topology -We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). +Now that Storm is up and running, we can put Bullet on it. We will use an example Spout that runs on Bullet 0.4.3 on our Storm cluster. The source is available [here](https://github.com/yahoo/bullet-docs/blob/master/examples/storm). This was part of the artifact that you installed in Step 1. -#### Step 6: Install Spark 2.2.1 +#### Step 5: Setup the Storm example ```bash -export BULLET_SPARK=$BULLET_HOME/backend/spark -cd $BULLET_SPARK -curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz -tar -xzf spark-2.2.1-bin-hadoop2.7.tgz +cp $BULLET_EXAMPLES/storm/* $BULLET_HOME/backend/storm ``` -#### Step 7: Setup Bullet-Spark and Example Data Producer +!!! 
note "Settings" -```bash -cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar -``` + Take a look at bullet_settings.yaml for the settings that are being overridden for this example. You can add or change settings as you like by referring to [core Bullet settings in bullet_defaults.yaml](https://github.com/yahoo/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml) and [Storm settings in bullet_storm_defaults.yaml](https://github.com/yahoo/bullet-storm/blob/master/src/main/resources/bullet_storm_defaults.yaml). In particular, we have [customized these settings](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/resources/bullet_settings.yaml) that affect the Bullet queries you can run: + + ```bullet.query.max.duration: 570000``` Longest query time can be 570s. The Storm cluster default DRPC timeout is 600s. + + ```bullet.query.aggregation.raw.max.size: 500``` The max ```RAW``` records you can fetch is 500. + + ```bullet.query.aggregation.max.size: 1024``` The max records you can fetch for any query is 1024. + + ```bullet.query.aggregation.count.distinct.sketch.entries: 16384``` We can count 16384 unique values exactly. Approximates after. -#### Step 8: Launch the Bullet Spark Backend + ```bullet.query.aggregation.group.sketch.entries: 1024``` The max unique groups can be 1024. Uniform sample after. + + ```bullet.query.aggregation.distribution.sketch.entries: 1024``` Determines the normalized rank error for distributions. + + ```bullet.query.aggregation.top.k.sketch.entries: 1024``` 0.75 times this number is the number of unique items for which counts can be done exactly. Approximates after. + + ```bullet.query.aggregation.distribution.max.points: 200``` The maximum number of points you can generate, use or provide for a Distribution aggregation. + +!!! note "Want to tweak the example topology code?" + + You will need to clone the [examples repository](https://github.com/yahoo/bullet-docs/tree/master/examples/storm) and customize it. To build the examples, you'll need to install [Maven 3](https://maven.apache.org/install.html). + + ```cd $BULLET_HOME && git clone git@github.com:yahoo/bullet-docs.git``` + + ```cd bullet-docs/examples/storm && mvn package``` + + You will find the ```bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar``` in ```$BULLET_HOME/bullet-docs/examples/storm/target/``` + + You can also make the ```examples_artifacts.tar.gz``` file with all the settings that is placed in ```$BULLET_EXAMPLES``` by just running ```make``` in the ```bullet-docs/examples/``` folder. + +#### Step 6: Launch the topology ```bash -$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit --master local[10] --class com.yahoo.bullet.spark.BulletSparkStreamingMain --driver-class-path $BULLET_SPARK/bullet-spark-example.jar:$BULLET_SPARK/bullet-spark.jar $BULLET_SPARK/bullet-spark.jar &> log.txt & +cd $BULLET_HOME/backend/storm && ./launch.sh ``` +Visit the UI and see if the topology is up. You should see the ```DataSource``` spout begin emitting records. -The backend will usually be up and running within 5-10 seconds. The Web Service will now be hooked up through the REST PubSub to the Spark backend. 
You can now run a Bullet query by hitting the web service directly: +Test the Bullet topology by: ```bash -curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query +curl -s -X POST -d '{"id":"", "content":"{}"}' http://localhost:3774/drpc/bullet-query ``` -This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. +You should get a random record (serialized as a String inside a JSON message sent back through the PubSub) from Bullet. !!! note "What is this data?" - This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka. - - + This data is randomly generated by the [custom Storm spout](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that is in the example topology you just launched. In practice, your spout would read from an actual data source such as Kafka instead. See [below](#storm-topology) for more details about this random data spout. +### Setting up the Bullet Web Service +#### Step 7: Install the Bullet Web Service +```bash +cd $BULLET_HOME/service +curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.1.1/bullet-service-0.1.1-embedded.jar +cp $BULLET_EXAMPLES/web-service/example* $BULLET_HOME/service/ +cp $BULLET_EXAMPLES/storm/*jar-with-dependencies.jar $BULLET_HOME/service/bullet-storm-jar-with-dependencies.jar +``` +#### Step 8: Launch the Web Service +```bash +cd $BULLET_HOME/service +java -Dloader.path=bullet-storm-jar-with-dependencies.jar -jar bullet-service.jar --bullet.pubsub.config=example_drpc_pubsub_config.yaml --bullet.schema.file=example_columns.json --server.port=9999 --logging.path=. 
--logging.file=log.txt &> log.txt & +``` +You can verify that it is up by running a Bullet query or getting the example columns through the API: +```bash +curl -s -H 'Content-Type: text/plain' -X POST -d '{}' http://localhost:9999/api/bullet/query +curl -s http://localhost:9999/api/bullet/columns +``` ### Setting up the Bullet UI diff --git a/examples/Makefile b/examples/Makefile index b4177bee..605c5405 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -15,7 +15,8 @@ build: cp install-all-spark.sh bullet-examples/ cd spark && mvn package cp spark/target/bullet-spark-example-0.0.1-SNAPSHOT.jar bullet-examples/backend/spark/bullet-spark-example.jar - cp spark/src/main/resources/bullet_spark_settings.yaml bullet-examples/backend/spark + cp spark/src/main/resources/bullet_spark_kafka_settings.yaml bullet-examples/backend/spark + cp spark/src/main/resources/bullet_spark_rest_settings.yaml bullet-examples/backend/spark mkdir -p bullet-examples/web-service cp web-service/* bullet-examples/web-service mkdir -p bullet-examples/ui diff --git a/examples/spark/pom.xml b/examples/spark/pom.xml index 593c17e3..3454c6fc 100644 --- a/examples/spark/pom.xml +++ b/examples/spark/pom.xml @@ -10,7 +10,7 @@ 2.11.7 2.11 2.3.0 - 0.1.0 + 0.1.2 diff --git a/examples/spark/src/main/resources/bullet_spark_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml similarity index 99% rename from examples/spark/src/main/resources/bullet_spark_settings.yaml rename to examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml index 06413ea2..0c63339b 100644 --- a/examples/spark/src/main/resources/bullet_spark_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml @@ -32,4 +32,3 @@ bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub" bullet.pubsub.kafka.bootstrap.servers: "localhost:9092" bullet.pubsub.kafka.request.topic.name: "bullet.requests" bullet.pubsub.kafka.response.topic.name: "bullet.responses" - diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml new file mode 100644 index 00000000..728546c3 --- /dev/null +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml @@ -0,0 +1,103 @@ +######################################################################################################################## +############################################### Bullet Spark defaults ################################################# +######################################################################################################################## +# This is the name of the concrete implementation of Data Producer to use. +bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" + +# This is the batch interval of your Spark Streaming job. Find out more at +# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. +bullet.spark.batch.duration.ms: 1000 + +# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. +bullet.spark.receiver.query.block.size: 1 + +# This is the maximum number of partitions that will be created by the Query Receiver. +bullet.spark.receiver.query.coalesce.partitions: 10 + +# This is the number of Data Producers. +bullet.spark.data.producer.parallelism: 1 + +# This is the checkpoint directory. 
If you are running your Spark on a cluster, the directory must be an HDFS path.
+bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
+
+# If true, Bullet Spark recovers context from checkpoint files when restarting.
+# Otherwise Bullet Spark creates a new context.
+bullet.spark.recover.from.checkpoint.enable: false
+
+# This is the Spark application name.
+bullet.spark.app.name: "BulletSparkStreamingJob"
+
+# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
+bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (which is fixed by the producer).
+# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering
+# operation concurrently.
+bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
+bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to managing a thread pool, one is only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
+bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
+
+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use this to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If you need settings for your publisher in this mode that are different from the settings used in the
+# result publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
+########################################################################################################################
+############################################### Spark Streaming defaults ###############################################
+########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and submit more variable settings while submitting the job on the
+# command line.
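+#
+# For example (an illustrative sketch, not part of these settings), a run-specific value such as the driver memory
+# can be supplied with --conf at submit time instead of being added to this file:
+#
+#   spark-submit --conf spark.driver.memory=2g --class com.yahoo.bullet.spark.BulletSparkStreamingMain ...
+#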
+spark.serializer: "org.apache.spark.serializer.KryoSerializer"
+spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
+spark.streaming.stopGracefullyOnShutdown: "true"
+spark.streaming.receiver.writeAheadLog.enable: "false"
+spark.streaming.driver.writeAheadLog.allowBatching: "false"
+
+########################################################################################################################
+############################################### Query PubSub defaults ##################################################
+########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
+bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
+bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
+# Add settings specific to your PubSub.
+bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
+bullet.pubsub.kafka.request.topic.name: "bullet.requests"
+bullet.pubsub.kafka.response.topic.name: "bullet.responses"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml new file mode 100644 index 00000000..f1e37c06 --- /dev/null +++ b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml @@ -0,0 +1,36 @@ +######################################################################################################################## +######################################### Bullet Spark Settings ##################################### +######################################################################################################################## +bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" +bullet.spark.batch.duration.ms: 1000 +bullet.spark.receiver.query.block.size: 1 +bullet.spark.receiver.query.coalesce.partitions: 10 +bullet.spark.data.producer.parallelism: 1 +bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" +bullet.spark.recover.from.checkpoint.enable: false +bullet.spark.app.name: "BulletSparkStreamingJob" +bullet.spark.metrics.enabled: false +bullet.spark.filter.partition.parallel.mode.enabled: false +bullet.spark.filter.partition.parallel.mode.parallelism: 4 +bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10 + +######################################################################################################################## +######################################### Spark Streaming Settings ##################################### +######################################################################################################################## +spark.serializer: "org.apache.spark.serializer.KryoSerializer" +spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer" +spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator" +spark.streaming.stopGracefullyOnShutdown: "true" +spark.streaming.receiver.writeAheadLog.enable: "false" +spark.streaming.driver.writeAheadLog.allowBatching: "false" + +######################################################################################################################## +######################################### Query PubSub Settings ######################################## +######################################################################################################################## +bullet.pubsub.context.name: "QUERY_PROCESSING" +bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" +# A list of url(s) for the query endpoint. In the web service, this should contain a single URL for the query endpoint +# of the in-memory pubsub instance running on that web service. For the backend it should contain the urls of all +# the pubsub instances. 
+bullet.pubsub.rest.query.urls: + - "http://localhost:9999/api/bullet/pubsub/query" diff --git a/examples/ui/env-settings.json b/examples/ui/env-settings.json index 9aa00db6..2d33339c 100644 --- a/examples/ui/env-settings.json +++ b/examples/ui/env-settings.json @@ -2,50 +2,62 @@ "default": { "queryHost": "http://localhost:9999", "queryNamespace": "api/bullet", - "queryPath": "sse-query", + "queryPath": "ws-query", + "queryStompRequestChannel": "/server/request", + "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:9999", "schemaNamespace": "api/bullet", "helpLinks": [ { - "name": "Examples", - "link": "https://yahoo.github.io/bullet-docs/ui/usage" + "name": "Tutorials", + "link": "https://bullet-db.github.io/ui/usage" } ], - "bugLink": "https://github.com/yahoo/bullet-ui/issues", - "modelVersion": 2, + "bugLink": "https://github.com/bullet-db/bullet-ui/issues", + "modelVersion": 3, "migrations": { - "deletions": "result" + "deletions": "query" }, "defaultValues": { - "aggregationMaxSize": 1024, - "rawMaxSize": 500, - "durationMaxSecs": 540, + "aggregationMaxSize": 512, + "rawMaxSize": 100, + "durationMaxSecs": 120, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, "distributionQuantileEnd": 1, "distributionQuantileIncrement": 0.1, "queryTimeoutSecs": 3, + "windowEmitFrequencyMinSecs": 1, + "everyForRecordBasedWindow": 1, + "everyForTimeBasedWindow": 2, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, "distributionMaxEntries": 1024, - "distributionMaxNumberOfPoints": 200, + "distributionMaxNumberOfPoints": 100, "topKMaxEntries": 1024, "topKErrorType": "No False Negatives" }, "metadataKeyMapping": { - "theta": "theta", - "uniquesEstimate": "uniques_estimate", - "queryCreationTime": "query_receive_time", - "queryTerminationTime": "query_finish_time", - "estimatedResult": "was_estimated", - "standardDeviations": "standard_deviations", - "normalizedRankError": "normalized_rank_error", - "maximumCountError": "maximum_count_error", - "itemsSeen": "items_seen", - "minimumValue": "minimum_value", - "maximumValue": "maximum_value" + "querySection": "Query", + "windowSection": "Window", + "sketchSection": "Sketch", + "theta": "Theta", + "uniquesEstimate": "Uniques Estimate", + "queryCreationTime": "Receive Time", + "queryTerminationTime": "Finish Time", + "estimatedResult": "Was Estimated", + "standardDeviations": "Standard Deviations", + "normalizedRankError": "Normalized Rank Error", + "maximumCountError": "Maximum Count Error", + "itemsSeen": "Items Seen", + "minimumValue": "Minimum Value", + "maximumValue": "Maximum Value", + "windowNumber": "Number", + "windowSize": "Size", + "windowEmitTime": "Emit Time", + "expectedEmitTime": "Expected Emit Time" } } }