diff --git a/docs/quick-start/bullet-on-storm-with-rest.md b/docs/quick-start/bullet-on-storm-with-rest.md
deleted file mode 100644
index eed84fb4..00000000
--- a/docs/quick-start/bullet-on-storm-with-rest.md
+++ /dev/null
@@ -1,417 +0,0 @@
-# Quick Start
-
-This section gets you running a mock instance of Bullet to play around with. The instance will run using Bullet on Spark and use the REST PubSub available as part of bullet-core. Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](backend/ingestion.md) using [some simple custom Spark code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala). If you want to use Bullet for your data, you will need to read and convert your data to Bullet Records in a similar manner.
-
-At the end of this section, you will have:
-
- * Launched the Bullet backend on Spark
- * Setup the [Web Service](ws/setup.md) with its built-in REST PubSub enabled
- * Setup the [UI](ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/yahoo/bullet-ui/releases/tag/v0.4.0)
-
-**Prerequisites**
-
- * You will need to be on a Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed
- * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed
-
-## Install Script
-
-DO THIS LATER (one-liner?)
-
-## Manual Installation
-
-### Setup the Bullet Web Service and REST PubSub
-
-Before we launch the Bullet Spark backend, we first need to set up the Bullet Web Service and PubSub layer. The bullet-core repo provides a [pubsub.rest](https://github.com/bullet-db/bullet-core/tree/master/src/main/java/com/yahoo/bullet/pubsub/rest) package, which is a simple implementation of the PubSub layer using REST endpoints. The Bullet Web Service can be configured to use this built-in REST PubSub to provide the additional REST endpoints needed to serve as a PubSub layer as well as the web service.
-
-#### Step 1: Setup directories and examples
-
-```bash
-export BULLET_HOME=$(pwd)/bullet-quickstart
-mkdir -p $BULLET_HOME/backend/spark
-mkdir -p $BULLET_HOME/service
-mkdir -p $BULLET_HOME/ui
-cd $BULLET_HOME
-DO THE THING to download the compressed folder - used to be: curl -LO https://github.com/yahoo/bullet-docs/releases/download/v0.4.0/examples_artifacts.tar.gz
-tar -xzf examples_artifacts.tar.gz
-export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples
-```
-
-#### Step 2: Install the Bullet Web Service
-
-```bash
-cd $BULLET_HOME/service
-curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.1/bullet-service-0.2.1-embedded.jar
-cp $BULLET_EXAMPLES/web-service/example_rest_pubsub_config.yaml $BULLET_HOME/service/
-cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/
-```
-
-#### Step 3: Launch the Web Service
-
-```bash
-cd $BULLET_HOME/service
-java -jar bullet-service.jar --bullet.pubsub.config=$BULLET_HOME/service/example_rest_pubsub_config.yaml --bullet.schema.file=$BULLET_HOME/service/example_columns.json --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path=. --logging.file=log.txt &> log.txt &
-```
-
-The Web Service usually takes ~10-15 seconds to start.
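Rather than waiting a fixed amount of time, you can poll the Web Service until it answers. This is a minimal sketch, assuming the service is listening on localhost:9999 as configured above and reusing the columns endpoint from the optional test below:

```bash
# Poll the Web Service for up to ~60 seconds until it returns an HTTP success.
for attempt in $(seq 1 30); do
    if curl -sf -o /dev/null http://localhost:9999/api/bullet/columns; then
        echo "Web Service is up"
        break
    fi
    sleep 2
done
```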
-
-You can check the status of the Web Service by looking at the Web Service log:
-
-```bash
-cat $BULLET_HOME/service/log.txt
-```
-
-The log should contain a message that reads something like `Started Application in X seconds` (usually the last line of the file if the web service has been run recently).
-
-#### Step 4: Test the Web Service (optional)
-
-We can check that the Web Service is up and running by getting the example columns through the API:
-
-```bash
-curl -s http://localhost:9999/api/bullet/columns
-```
-
-#### Step 5: Test the PubSub Layer (optional)
-
-To ensure that the Web Service has been configured to expose the necessary PubSub REST endpoints, we can "write" a fake query to the PubSub and then read it back by hand. Since there is currently no backend running, any queries written to the PubSub will simply be stored there until we read them manually.
-
-Write a fake empty query to the query endpoint:
-
-```bash
-curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query
-```
-
-Receiving no error response should indicate that the fake query was written to the PubSub. Then read a query from this same endpoint:
-
-```bash
-curl http://localhost:9999/api/bullet/pubsub/query
-```
-
-This should print `'{}'` to the screen, indicating we have successfully written and then read a fake empty query from the PubSub layer. Subsequent reads from this endpoint will return nothing because no more queries have been written to the PubSub endpoint.
-
-
-### Setup Bullet Backend on Spark
-
-We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html).
-
-#### Step 6: Install Spark 2.2.1
-
-```bash
-export BULLET_SPARK=$BULLET_HOME/backend/spark
-cd $BULLET_SPARK
-curl -O http://www-eu.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz
-tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
-```
-
-#### Step 7: Setup Bullet-Spark and Example Data Producer
-
-```bash
-cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
-```
-
-#### Step 8: Launch the Bullet Spark Backend
-
-```bash
-$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit --master local[10] --class com.yahoo.bullet.spark.BulletSparkStreamingMain --driver-class-path $BULLET_SPARK/bullet-spark-example.jar:$BULLET_SPARK/bullet-spark.jar $BULLET_SPARK/bullet-spark.jar &> log.txt &
-```
-
-The backend will usually be up and running within 5-10 seconds. The Web Service will now be hooked up through the REST PubSub to the Spark backend. You can now run a Bullet query by hitting the Web Service directly:
-
-```bash
-curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query
-```
-
-This query will return a result JSON containing a "records" field with a single record, and a "meta" field with some meta information.
-
-!!! note "What is this data?"
-
-    This data is randomly generated by the [custom data producer](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that was created for the sole purpose of generating toy data to demo Bullet. In practice, your spout would read from an actual data source such as Kafka.
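Beyond the empty aggregation query above, you can sketch richer queries against the same endpoint. The example below is a hypothetical filtered RAW query: the "type" field and its values come from the example data producer and the query shape is the pre-windowing JSON format, so adjust the field names and structure to match your own schema and Bullet version:

```bash
# Fetch up to 10 records whose "type" field equals "foo" (hypothetical field/value).
curl -s -H 'Content-Type: text/plain' -X POST -d '{
  "filters": [{"field": "type", "operation": "==", "values": ["foo"]}],
  "aggregation": {"type": "RAW", "size": 10}
}' http://localhost:9999/api/bullet/sse-query
```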
-
-
-
-
-
-
-
-
-### Setting up the Bullet UI
-
-#### Step 9: Install Node
-
-```bash
-curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash
-source ~/.bashrc
-nvm install v6.9.4
-nvm use v6.9.4
-```
-
-#### Step 10: Install the Bullet UI
-
-```bash
-cd $BULLET_HOME/ui
-curl -LO https://github.com/yahoo/bullet-ui/releases/download/v0.4.0/bullet-ui-v0.4.0.tar.gz
-tar -xzf bullet-ui-v0.4.0.tar.gz
-cp $BULLET_EXAMPLES/ui/env-settings.json config/
-```
-
-#### Step 11: Launch the UI
-
-```bash
-PORT=8800 node express-server.js &
-```
-
-Visit [http://localhost:8800](http://localhost:8800) to query your topology with the UI. See [UI usage](ui/usage.md) for some example queries and interactions using this UI. You can see what the Schema means by visiting the Schema section.
-
-!!! note "Running it remotely?"
-
-    If you access the UI from a machine other than the one where the UI is actually running, you will need to edit ```config/env-settings.json```. Since the UI is a client-side app, the machine that your browser is running on will fetch the UI and attempt to use these settings to talk to the Web Service. Since they point to localhost by default, your browser will attempt to connect there and fail. An easy fix is to change ```localhost``` in your env-settings.json to point to the host name where you will be hosting the UI. This will be the same as the UI host you use in the browser. You can also do a local port forward on the machine accessing the UI by running:
-    ```ssh -N -L 8800:localhost:8800 -L 9999:localhost:9999 hostname-of-the-quickstart-components 2>&1```
-
-## Playing around with the instance
-
-Check out and follow along with the [UI Usage](ui/usage.md) page as it shows you some queries you can run using this UI.
-
-## Teardown
-
-If you were using the [Install Script](#install-script) or if you don't want to manually bring down everything, you can run:
-
-```bash
-curl -sLo- https://raw.githubusercontent.com/yahoo/bullet-docs/v0.4.0/examples/install-all.sh | bash -s cleanup
-```
-
-If you were performing the steps yourself, you can also manually clean up **all the components and all the downloads** using:
-
-| | |
-| -------------- | ---------------------------------------------------------------- |
-| UI | ```pkill -f [e]xpress-server.js``` |
-| Web Service | ```pkill -f [e]xample_drpc_pubsub_config.yaml``` |
-| Storm | ```pkill -f [a]pache-storm-1.1.2``` |
-| File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` |
-
-This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```.
-
-## What did we do?
-
-This section will go over the various custom pieces this example plugged into Bullet, so you can better understand what we did.
-
-### Storm topology
-
-The topology was the Bullet topology plugged in with a custom spout. This spout is implemented in this [example project](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configurable.
If you examine $BULLET_HOME/backend/storm/launch.sh, you'll see the following:
-
-```bash
-storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \
-    com.yahoo.bullet.Topology \
-    --bullet-conf bullet_settings.yaml \
-    --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \
-    --bullet-spout-parallelism 1 \
-    ...
-    --bullet-spout-arg 20 \
-    --bullet-spout-arg 101 \
-    ...
-```
-
-This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long.
-
-The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout producing ~200 rps.
-
-!!! note "I thought you said hundreds of thousands of records..."
-
-    200 records per second is not Big Data by any stretch of the imagination, but this Quick Start is running everything on one machine and is meant to introduce you to what Bullet does. In practice, you would scale and run your components with CPU and memory configurations to accommodate your data volume and querying needs.
-
-
-Let's look at the [custom spout code](https://github.com/yahoo/bullet-docs/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that generates the data.
-
-```java
-    @Override
-    public void nextTuple() {
-        long timeNow = System.nanoTime();
-        // Only emit if we are still in the interval and haven't gone over our per period max
-        if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) {
-            outputCollector.emit(new Values(generateRecord()), DUMMY_ID);
-            generatedThisPeriod++;
-        }
-        if (timeNow > nextIntervalStart) {
-            log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod);
-            nextIntervalStart = timeNow + period;
-            generatedThisPeriod = 0;
-            periodCount++;
-        }
-        // It is courteous to sleep for a short time if you're not emitting anything...
-        try {
-            Thread.sleep(1);
-        } catch (InterruptedException e) {
-            log.error("Error: ", e);
-        }
-    }
-```
-
-The Storm framework calls this method to emit tuples; it emits at most the configured maximum number of tuples per period.
-
-!!! note "Why a DUMMY_ID?"
-
-    When the spout emits the randomly generated tuple, it attaches a ```DUMMY_ID``` to it. In Storm terms, this is a message ID. By adding a message ID, this tuple can be made to flow reliably. The Bullet component that receives this tuple (the Filter bolt) acknowledges or "acks" it. If the tuple did not make it to the Filter bolt within a configured timeout window, Storm would call a ```fail(Object messageId)``` method on the spout. This particular spout does not define one, hence the ```DUMMY_ID```. If your source of data can identify records uniquely and you can re-emit them on a fail, you should attach that actual ID in place of the ```DUMMY_ID```.
-
-```java
-    private BulletRecord generateRecord() {
-        BulletRecord record = new BulletRecord();
-        String uuid = UUID.randomUUID().toString();
-
-        record.setString(STRING, uuid);
-        record.setLong(LONG, (long) generatedThisPeriod);
-        record.setDouble(DOUBLE, random.nextDouble());
-        record.setString(TYPE, STRING_POOL[random.nextInt(STRING_POOL.length)]);
-        record.setLong(DURATION, System.currentTimeMillis() % INTEGER_POOL[random.nextInt(INTEGER_POOL.length)]);
-
-        Map<String, Boolean> booleanMap = new HashMap<>(4);
-        booleanMap.put(uuid.substring(0, 8), random.nextBoolean());
-        booleanMap.put(uuid.substring(9, 13), random.nextBoolean());
-        booleanMap.put(uuid.substring(14, 18), random.nextBoolean());
-        booleanMap.put(uuid.substring(19, 23), random.nextBoolean());
-        record.setBooleanMap(BOOLEAN_MAP, booleanMap);
-
-        Map<String, Long> statsMap = new HashMap<>(4);
-        statsMap.put(PERIOD_COUNT, periodCount);
-        statsMap.put(RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod);
-        statsMap.put(NANO_TIME, System.nanoTime());
-        statsMap.put(TIMESTAMP, System.currentTimeMillis());
-        record.setLongMap(STATS_MAP, statsMap);
-
-        Map<String, String> randomMapA = new HashMap<>(2);
-        Map<String, String> randomMapB = new HashMap<>(2);
-        randomMapA.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]);
-        randomMapA.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]);
-        randomMapB.put(RANDOM_MAP_KEY_A, STRING_POOL[random.nextInt(STRING_POOL.length)]);
-        randomMapB.put(RANDOM_MAP_KEY_B, STRING_POOL[random.nextInt(STRING_POOL.length)]);
-        record.setListOfStringMap(LIST, asList(randomMapA, randomMapB));
-
-        return record;
-    }
-```
-
-This method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types.
-
-If you put Bullet on your data, you will need to write a Spout (or a topology if your reading logic is complex) that reads from your data source and emits BulletRecords with the fields you wish to be queryable, similar to this example.
-
-### PubSub
-
-We used the [DRPC PubSub](pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend.
-
-```yaml
-bullet.pubsub.context.name: "QUERY_PROCESSING"
-bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub"
-bullet.pubsub.storm.drpc.function: "bullet-query"
-```
-
-For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service.
-
-```yaml
-bullet.pubsub.context.name: "QUERY_SUBMISSION"
-bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub"
-bullet.pubsub.storm.drpc.servers:
-    - 127.0.0.1
-bullet.pubsub.storm.drpc.function: "bullet-query"
-bullet.pubsub.storm.drpc.http.protocol: "http"
-bullet.pubsub.storm.drpc.http.port: "3774"
-bullet.pubsub.storm.drpc.http.path: "drpc"
-bullet.pubsub.storm.drpc.http.connect.retry.limit: 3
-bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000
-```
-
-### Web Service
-
-We launched the Web Service using two custom files - a PubSub configuration YAML file and a JSON schema file.
-
-The JSON columns file contains the schema for our data specified in JSON.
Since our schema is not going to change, we use the Web Service to serve it from a file. If your schema changes dynamically, you will need to provide your own endpoint to the UI.
-
-The following is a snippet from the [JSON file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_columns.json). Notice how the types of the fields are specified. Also, if you have generated BulletRecords with Map fields whose keys are known, you can specify them here using ```enumerations```.
-
-```javascript
-[
-    {
-        "name": "probability",
-        "type": "DOUBLE",
-        "description": "Generated from Random#nextDouble"
-    },
-    ...
-    {
-        "name": "stats_map",
-        "type": "MAP",
-        "subtype": "LONG",
-        "description": "This map contains some numeric information such as the current number of periods etc.",
-        "enumerations": [
-            ...
-            {"name": "nano_time", "description": "The ns time when this record was generated"}
-        ]
-    },
-    {
-        "name": "classifiers",
-        "type": "LIST",
-        "subtype": "MAP",
-        "description": "This contains two maps, each with: field_A and field_B whose values are randomly chosen from: foo, bar, baz, qux, quux, norf"
-    }
-]
-```
-The contents of the [PubSub configuration file](https://github.com/yahoo/bullet-docs/blob/master/examples/web-service/example_drpc_pubsub_config.yaml) were discussed in the [PubSub section above](#pubsub).
-
-### UI
-
-Finally, we configured the UI with the custom environment-specific settings file. We did not add any environments since we only had the one.
-
-```javascript
-{
-  "default": {
-    "queryHost": "http://localhost:9999",
-    "queryNamespace": "api/bullet",
-    "queryPath": "query",
-    "schemaHost": "http://localhost:9999",
-    "schemaNamespace": "api/bullet",
-    "helpLinks": [
-      {
-        "name": "Examples",
-        "link": "https://yahoo.github.io/bullet-docs/ui/usage"
-      }
-    ],
-    "bugLink": "https://github.com/yahoo/bullet-ui/issues",
-    "modelVersion": 2,
-    "migrations": {
-      "deletions": "result"
-    },
-    "defaultValues": {
-      "defaultValues": {
-        "aggregationMaxSize": 1024,
-        "rawMaxSize": 500,
-        "durationMaxSecs": 540,
-        "distributionNumberOfPoints": 11,
-        "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1",
-        "distributionQuantileStart": 0,
-        "distributionQuantileEnd": 1,
-        "distributionQuantileIncrement": 0.1,
-        "queryTimeoutSecs": 3,
-        "sketches": {
-          "countDistinctMaxEntries": 16384,
-          "groupByMaxEntries": 512,
-          "distributionMaxEntries": 1024,
-          "distributionMaxNumberOfPoints": 200,
-          "topKMaxEntries": 1024,
-          "topKErrorType": "No False Negatives"
-        },
-        "metadataKeyMapping": {
-          "theta": "theta",
-          "uniquesEstimate": "uniques_estimate",
-          "queryCreationTime": "query_receive_time",
-          "queryTerminationTime": "query_finish_time",
-          "estimatedResult": "was_estimated",
-          "standardDeviations": "standard_deviations",
-          "normalizedRankError": "normalized_rank_error",
-          "maximumCountError": "maximum_count_error",
-          "itemsSeen": "items_seen",
-          "minimumValue": "minimum_value",
-          "maximumValue": "maximum_value"
-        }
-      }
-    }
-  }
-}
-```
-
-Since we served our schema through the same Web Service as our queries, both of these point to our Web Service. Note that there is no ```schemaPath``` because it must be the constant string ```columns```. If you define a custom endpoint for your schema, you must ensure that it can be obtained by making a GET request to ```schemaHost/schemaNamespace/columns```.
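To check the schema contract described above, you can issue the GET request the UI would make; a quick sanity check, assuming the host and namespace used throughout this guide:

```bash
# The UI resolves the schema as GET schemaHost/schemaNamespace/columns.
curl -s http://localhost:9999/api/bullet/columns
```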
diff --git a/docs/quick-start/storm.md b/docs/quick-start/storm.md
index e8ab231a..118a8c99 100644
--- a/docs/quick-start/storm.md
+++ b/docs/quick-start/storm.md
@@ -1,16 +1,13 @@
 # Quick Start on Storm
 
-!!! note "NOTE: This is an old version of Bullet"
-    The version of Bullet this Quick Start uses does not support the newest functionality such as Windowing. We are working hard to get new documentation up as soon as possible. Use the [Spark Quick Start](spark.md) to see all the latest features. An updated Quick Start for Storm is coming soon.
-
 This section gets you running a mock instance of Bullet to play around with. The instance will run using [Bullet on Storm](../backend/storm-setup.md) and use the [DRPC Pubsub](../pubsub/storm-drpc.md). Since we do not have an actual data source, we will produce some fake data and convert it into [Bullet Records](../backend/ingestion.md) in a [custom Storm spout](https://github.com/bullet-db/bullet-db.github.io/blob/master/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java). If you want to use Bullet for your data, you will need to read and convert your data to Bullet Records in a similar manner.
 
 At the end of this section, you will have:
 
- * Setup the Bullet topology using a custom spout on [bullet-storm-0.6.2](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.6.2)
- * Setup the [Web Service](../ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-0.1.1](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.1.1)
- * Setup the [DRPC PubSub](../pubsub/storm-drpc.md) talking to the topology and Web Service.
- * Setup the [UI](../ui/setup.md) talking to the Web Service using [bullet-ui-0.4.0](https://github.com/bullet-db/bullet-ui/releases/tag/v0.4.0)
+ * Setup the Bullet topology using a custom spout on [bullet-storm-0.8.3](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.3)
+ * Setup the [Web Service](../ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-0.2.2](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.2.2)
+ * Setup the [REST PubSub](../pubsub/rest.md) talking to the topology and Web Service.
+ * Setup the [UI](../ui/setup.md) talking to the Web Service using [bullet-ui-0.5.0](https://github.com/bullet-db/bullet-ui/releases/tag/v0.5.0)
 
 **Prerequisites**
 
@@ -51,20 +48,14 @@ tar -xzf examples_artifacts.tar.gz
 export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples
 ```
 
-#### Step 2: Install Storm 1.1
+#### Step 2: Install Storm 1.2
 
 ```bash
 cd $BULLET_HOME/backend
-curl -O http://apache.org/dist/storm/apache-storm-1.1.2/apache-storm-1.1.2.zip
-unzip apache-storm-1.1.2.zip
-export PATH=$(pwd)/apache-storm-1.1.2/bin/:$PATH
-```
-Add a DRPC server setting to the Storm config:
-
-```bash
-echo 'drpc.servers: ["127.0.0.1"]' >> apache-storm-1.1.2/conf/storm.yaml
+curl -O http://apache.org/dist/storm/apache-storm-1.2.2/apache-storm-1.2.2.zip
+unzip apache-storm-1.2.2.zip
+export PATH=$(pwd)/apache-storm-1.2.2/bin/:$PATH
 ```
-
 #### Step 3: Launch Storm components
 
 Launch each of the following components, in order, and wait for the commands to go through. You may have to do these one at a time. You will see a JVM being launched for each one and connection messages as the components communicate through Zookeeper.
@@ -72,7 +63,6 @@ Launch each of the following components, in order and wait for the commands to g
 ```bash
 storm dev-zookeeper &
 storm nimbus &
-storm drpc &
 storm ui &
 storm logviewer &
 storm supervisor &
@@ -82,48 +72,24 @@ It may take 30-60 seconds for all the components to launch.
 
 Once everything is up without errors, visit [http://localhost:8080](http://localhost:8080) and see if the Storm UI loads.
 
-#### Step 4: Test Storm (Optional)
-
-Before Bullet, test to see if Storm and DRPC are up and running by launching an example topology that comes with your Storm installation:
-
-```bash
-storm jar apache-storm-1.1.2/examples/storm-starter/storm-starter-topologies-1.1.2.jar org.apache.storm.starter.BasicDRPCTopology topology
-```
-
-Visit your UI with a browser and see if a topology with name "topology" is running. If everything is good, you should be able to ping DRPC with:
-
-```bash
-curl localhost:3774/drpc/exclamation/foo
-```
-
-and get back a ```foo!```. Any string you pass as part of the URL is returned to you with a "!" at the end.
-
-Kill this topology after with:
-
-```bash
-storm kill topology
-```
-
 !!! note "Local mode cleanup"
 
     If you notice any problems while setting up Storm or while relaunching a topology, it may be because some state is corrupted. When running Storm in this fashion, states and serializations are stored in ```storm-local``` and ```/tmp/```. You may want to ```rm -rf storm-local/* /tmp/dev-storm-zookeeper``` to clean up this state before relaunching Storm components. See the [tear down section](#teardown) on how to kill any running instances.
 
 ### Setting up the example Bullet topology
 
-Now that Storm is up and running, we can put Bullet on it. We will use an example Spout that runs on Bullet 0.4.3 on our Storm cluster. The source is available [here](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm). This was part of the artifact that you installed in Step 1.
+Now that Storm is up and running, we can put Bullet on it. We will use an example Spout that runs on Bullet 0.8.3 on our Storm cluster. The source is available [here](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm). This was part of the artifact that you installed in Step 1.
 
-#### Step 5: Setup the Storm example
+#### Step 4: Setup the Storm example
 
 ```bash
-cp $BULLET_EXAMPLES/storm/* $BULLET_HOME/backend/storm
+cp $BULLET_EXAMPLES/backend/storm/* $BULLET_HOME/backend/storm
 ```
 
 !!! note "Settings"
 
     Take a look at bullet_settings.yaml for the settings that are being overridden for this example. You can add or change settings as you like by referring to [core Bullet settings in bullet_defaults.yaml](https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml) and [Storm settings in bullet_storm_defaults.yaml](https://github.com/bullet-db/bullet-storm/blob/master/src/main/resources/bullet_storm_defaults.yaml). In particular, we have [customized these settings](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm/src/main/resources/bullet_settings.yaml) that affect the Bullet queries you can run:
 
-    - ```bullet.query.max.duration: 570000``` Longest query time can be 570s. The Storm cluster default DRPC timeout is 600s.
     - ```bullet.query.aggregation.raw.max.size: 500``` The max ```RAW``` records you can fetch is 500.
     - ```bullet.query.aggregation.max.size: 1024``` The max records you can fetch for any query is 1024.
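If you want to confirm which of these overrides are in effect before launching the topology, you can inspect the settings file you just copied; a small check, assuming it landed in $BULLET_HOME/backend/storm as in the step above:

```bash
# List the aggregation-related overrides from the example settings file.
grep 'bullet.query.aggregation' $BULLET_HOME/backend/storm/bullet_settings.yaml
```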
@@ -144,48 +110,42 @@ cp $BULLET_EXAMPLES/storm/* $BULLET_HOME/backend/storm
 
     ```cd $BULLET_HOME && git clone git@github.com:bullet-db/bullet-db.github.io.git```
 
-    ```cd bullet-docs/examples/storm && mvn package```
+    ```cd bullet-db.github.io/examples/storm && mvn package```
 
-    You will find the ```bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar``` in ```$BULLET_HOME/bullet-docs/examples/storm/target/```
+    You will find the ```bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar``` in ```$BULLET_HOME/bullet-db.github.io/examples/storm/target/```
 
-    You can also make the ```examples_artifacts.tar.gz``` file with all the settings that is placed in ```$BULLET_EXAMPLES``` by just running ```make``` in the ```bullet-docs/examples/``` folder.
+    You can also make the ```examples_artifacts.tar.gz``` file with all the settings that is placed in ```$BULLET_EXAMPLES``` by just running ```make``` in the ```bullet-db.github.io/examples/``` folder.
 
-#### Step 6: Launch the topology
+#### Step 5: Launch the topology
 
 ```bash
 cd $BULLET_HOME/backend/storm && ./launch.sh
 ```
 
 Visit the UI and see if the topology is up. You should see the ```DataSource``` spout begin emitting records.
 
-Test the Bullet topology by:
-
-```bash
-curl -s -X POST -d '{"id":"", "content":"{}"}' http://localhost:3774/drpc/bullet-query
-```
-
-You should get a random record (serialized as a String inside a JSON message sent back through the PubSub) from Bullet.
-
-!!! note "What is this data?"
+!!! note "Where is this data coming from?"
 
     This data is randomly generated by the [custom Storm spout](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java) that is in the example topology you just launched. In practice, your spout would read from an actual data source such as Kafka instead. See [below](#storm-topology) for more details about this random data spout.
 
 ### Setting up the Bullet Web Service
 
-#### Step 7: Install the Bullet Web Service
+#### Step 6: Install the Bullet Web Service
 
 ```bash
 cd $BULLET_HOME/service
-curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.1.1/bullet-service-0.1.1-embedded.jar
+curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.2.2/bullet-service-0.2.2-embedded.jar
 cp $BULLET_EXAMPLES/web-service/example* $BULLET_HOME/service/
-cp $BULLET_EXAMPLES/storm/*jar-with-dependencies.jar $BULLET_HOME/service/bullet-storm-jar-with-dependencies.jar
 ```
 
-#### Step 8: Launch the Web Service
+#### Step 7: Launch the Web Service
 
 ```bash
 cd $BULLET_HOME/service
-java -Dloader.path=bullet-storm-jar-with-dependencies.jar -jar bullet-service.jar --bullet.pubsub.config=example_drpc_pubsub_config.yaml --bullet.schema.file=example_columns.json --server.port=9999 --logging.path=. --logging.file=log.txt &> log.txt &
+java -jar bullet-service.jar --bullet.pubsub.config=example_rest_pubsub_config.yaml --bullet.schema.file=example_columns.json --bullet.pubsub.builtin.rest.enabled=true --server.port=9999 --logging.path=. --logging.file=log.txt &> log.txt &
 ```
+
+Note that we turned on the built-in REST PubSub in the Web Service when launching it. The REST PubSub implementation is bundled with the Web Service by default, so no additional jars are needed.
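Since the DRPC-based topology test was removed, you can instead sanity-check the built-in REST PubSub endpoints by hand, mirroring the optional PubSub test from the older quick start. A sketch, assuming the endpoint paths configured later in this guide; note that if the backend is already running, it may consume what you write before you can read it back:

```bash
# Write an empty query to the PubSub query endpoint...
curl -s -H 'Content-Type: application/json' -X POST -d '{}' http://localhost:9999/api/bullet/pubsub/query
# ...then try to read a query back. If the backend has already consumed it,
# this returns nothing.
curl -s http://localhost:9999/api/bullet/pubsub/query
```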
+
 You can verify that it is up by running a Bullet query or getting the example columns through the API:
 
 ```bash
 curl -s http://localhost:9999/api/bullet/columns
 ```
 
@@ -195,7 +155,7 @@
 ### Setting up the Bullet UI
 
-#### Step 9: Install Node
+#### Step 8: Install Node
 
 ```bash
 curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash
@@ -204,16 +164,16 @@ nvm install v6.9.4
 nvm use v6.9.4
 ```
 
-#### Step 10: Install the Bullet UI
+#### Step 9: Install the Bullet UI
 
 ```bash
 cd $BULLET_HOME/ui
-curl -LO https://github.com/bullet-db/bullet-ui/releases/download/src/bullet-ui-v0.4.0.tar.gz
-tar -xzf bullet-ui-v0.4.0.tar.gz
+curl -LO https://github.com/bullet-db/bullet-ui/releases/download/src/bullet-ui-v0.5.0.tar.gz
+tar -xzf bullet-ui-v0.5.0.tar.gz
 cp $BULLET_EXAMPLES/ui/env-settings.json config/
 ```
 
-#### Step 11: Launch the UI
+#### Step 10: Launch the UI
 
 ```bash
 PORT=8800 node express-server.js &
@@ -242,8 +202,8 @@ If you were performing the steps yourself, you can also manually cleanup **all t
 | | |
 | -------------- | ---------------------------------------------------------------- |
 | UI | ```pkill -f [e]xpress-server.js``` |
-| Web Service | ```pkill -f [e]xample_drpc_pubsub_config.yaml``` |
-| Storm | ```pkill -f [a]pache-storm-1.1.2``` |
+| Web Service | ```pkill -f [e]xample_rest_pubsub_config.yaml``` |
+| Storm | ```pkill -f [a]pache-storm-1.2.2``` |
 | File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper /tmp/jetty-*``` |
 
 This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```.
 
@@ -259,7 +219,7 @@ The topology was the Bullet topology plugged in with a custom spout. This spout
 
 ```bash
 storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \
     com.yahoo.bullet.Topology \
-    --bullet-conf bullet_settings.yaml \
+    --bullet-conf ./bullet_settings.yaml \
     --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \
     --bullet-spout-parallelism 1 \
     ...
     --bullet-spout-arg 20 \
     --bullet-spout-arg 101 \
     ...
 ```
 
 This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 4. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long.
 
-The settings defined by ```--bullet-conf bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps.
+The settings defined by ```--bullet-conf ./bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout producing ~200 rps.
 
 !!! note "I thought you said hundreds of thousands of records..."
@@ -287,6 +247,7 @@ Let's look at the [custom spout code](https://github.com/bullet-db/bullet-db.git
         if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) {
             outputCollector.emit(new Values(generateRecord()), DUMMY_ID);
             generatedThisPeriod++;
+            return;
         }
         if (timeNow > nextIntervalStart) {
             log.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod);
@@ -352,27 +313,26 @@ If you put Bullet on your data, you will need to write a Spout (or a topology if
 
 ### PubSub
 
-We used the [DRPC PubSub](../pubsub/storm-drpc.md) since we were using the Storm Backend. This code was included in the Bullet Storm artifact that we downloaded (the JAR with dependencies). We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend.
+We used the [REST PubSub](../pubsub/rest.md). Note that although a DRPC PubSub is also supported, it does not support windowing, so we have not used it for this example. We configured the Backend to use this PubSub by adding these settings to the YAML file that we passed to our Storm topology. Notice that we set the context to ```QUERY_PROCESSING``` since this is the Backend.
 
 ```yaml
 bullet.pubsub.context.name: "QUERY_PROCESSING"
-bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub"
-bullet.pubsub.storm.drpc.function: "bullet-query"
+bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub"
 ```
 
-For the Web Service, we passed in a YAML file that pointed to our DRPC server that was part of the Storm cluster we launched. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service.
+For the Web Service, we passed in a YAML file that pointed to itself for the REST endpoints that serve as the PubSub interface. Notice that we set the context to ```QUERY_SUBMISSION``` since this is the Web Service.
```yaml bullet.pubsub.context.name: "QUERY_SUBMISSION" -bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" -bullet.pubsub.storm.drpc.servers: - - 127.0.0.1 -bullet.pubsub.storm.drpc.function: "bullet-query" -bullet.pubsub.storm.drpc.http.protocol: "http" -bullet.pubsub.storm.drpc.http.port: "3774" -bullet.pubsub.storm.drpc.http.path: "drpc" -bullet.pubsub.storm.drpc.http.connect.retry.limit: 3 -bullet.pubsub.storm.drpc.http.connect.timeout.ms: 1000 +bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" +bullet.pubsub.rest.query.urls: + - "http://localhost:9999/api/bullet/pubsub/query" +bullet.pubsub.rest.result.url: "http://localhost:9999/api/bullet/pubsub/result" +bullet.pubsub.rest.connect.timeout.ms: 30000 +bullet.pubsub.rest.connect.retry.limit: 10 +bullet.pubsub.rest.subscriber.max.uncommitted.messages: 100 +bullet.pubsub.rest.result.subscriber.min.wait.ms: 10 +bullet.pubsub.rest.query.subscriber.min.wait.ms: 10 ``` ### Web Service @@ -420,51 +380,61 @@ Finally, we configured the UI with the custom environment specific settings file "default": { "queryHost": "http://localhost:9999", "queryNamespace": "api/bullet", - "queryPath": "query", + "queryPath": "ws-query", + "queryStompRequestChannel": "/server/request", + "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:9999", "schemaNamespace": "api/bullet", "helpLinks": [ { - "name": "Examples", - "link": "https://yahoo.github.io/bullet-docs/ui/usage" + "name": "Tutorials", + "link": "https://bullet-db.github.io/ui/usage" } ], - "bugLink": "https://github.com/yahoo/bullet-ui/issues", - "modelVersion": 2, + "bugLink": "https://github.com/bullet-db/bullet-ui/issues", + "modelVersion": 3, "migrations": { - "deletions": "result" + "deletions": "query" }, - "defaultValues": { "defaultValues": { "aggregationMaxSize": 1024, "rawMaxSize": 500, - "durationMaxSecs": 540, + "durationMaxSecs": 86400, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, "distributionQuantileEnd": 1, "distributionQuantileIncrement": 0.1, - "queryTimeoutSecs": 3, + "windowEmitFrequencyMinSecs": 1, + "everyForRecordBasedWindow": 1, + "everyForTimeBasedWindow": 2, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, "distributionMaxEntries": 1024, - "distributionMaxNumberOfPoints": 200, + "distributionMaxNumberOfPoints": 100, "topKMaxEntries": 1024, "topKErrorType": "No False Negatives" }, "metadataKeyMapping": { - "theta": "theta", - "uniquesEstimate": "uniques_estimate", - "queryCreationTime": "query_receive_time", - "queryTerminationTime": "query_finish_time", - "estimatedResult": "was_estimated", - "standardDeviations": "standard_deviations", - "normalizedRankError": "normalized_rank_error", - "maximumCountError": "maximum_count_error", - "itemsSeen": "items_seen", - "minimumValue": "minimum_value", - "maximumValue": "maximum_value" + "querySection": "Query", + "windowSection": "Window", + "sketchSection": "Sketch", + "theta": "Theta", + "uniquesEstimate": "Uniques Estimate", + "queryCreationTime": "Receive Time", + "queryTerminationTime": "Finish Time", + "estimatedResult": "Was Estimated", + "standardDeviations": "Standard Deviations", + "normalizedRankError": "Normalized Rank Error", + "maximumCountError": "Maximum Count Error", + "itemsSeen": "Items Seen", + "minimumValue": "Minimum Value", + "maximumValue": "Maximum Value", + "windowNumber": "Number", + "windowSize": "Size", + "windowEmitTime": "Emit Time", + 
"expectedEmitTime": "Expected Emit Time" } } } diff --git a/docs/releases.md b/docs/releases.md index 26d50070..aee52e94 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -26,6 +26,7 @@ The core Bullet logic (a library) that can be used to implement Bullet on differ | Date | Release | Highlights | | ------------ | ------------------------------------------------------------------------------------- | ---------- | +| 2018-06-22 | [**0.4.1**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.4.1) | Added RESTPublisher HTTP Timeout Setting | | 2018-06-18 | [**0.4.0**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.4.0) | Added support for Integer and Float data types, and configurable BulletRecordProvider class used to instantiate BulletRecords in bullet-core | | 2018-04-11 | [**0.3.4**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.3.4) | Pre-Start delaying and Buffering changes - queries are now buffered at the start of a query instead of start of each window | | 2018-03-30 | [**0.3.3**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.3.3) | Bug fix for com.yahoo.bullet.core.querying.Querier#isClosedForPartition | diff --git a/examples/install-all-storm.sh b/examples/install-all-storm.sh index 35b32549..864bb9d0 100755 --- a/examples/install-all-storm.sh +++ b/examples/install-all-storm.sh @@ -2,10 +2,10 @@ set -euo pipefail -BULLET_EXAMPLES_VERSION=0.4.0 -BULLET_UI_VERSION=0.4.0 -BULLET_WS_VERSION=0.1.1 -STORM_VERSION=1.0.3 +BULLET_EXAMPLES_VERSION=0.5.1 +BULLET_UI_VERSION=0.5.0 +BULLET_WS_VERSION=0.2.2 +STORM_VERSION=1.2.2 NVM_VERSION=0.33.1 NODE_VERSION=6.9.4 @@ -80,7 +80,6 @@ install_storm() { println "Configuring Storm ..." export PATH="${BACKEND}/${STORM}/bin/:${PATH}" - echo 'drpc.servers: ["127.0.0.1"]' >> "${BACKEND}/${STORM}/conf/storm.yaml" println "Done!" } @@ -91,9 +90,6 @@ launch_storm() { println "Launching Storm Nimbus..." storm nimbus & - println "Launching Storm DRPC..." - storm drpc & - println "Launching Storm UI..." storm ui & @@ -112,7 +108,7 @@ launch_storm() { launch_bullet_storm() { println "Copying Bullet topology configuration and artifacts..." - cp "${BULLET_EXAMPLES}/storm"/* "${BULLET_HOME}/backend/storm" + cp "${BULLET_EXAMPLES}/backend/storm"/* "${BULLET_HOME}/backend/storm" println "Launching the Bullet topology..." println "==============================================================================" @@ -123,11 +119,6 @@ launch_bullet_storm() { println "==============================================================================" sleep 30 println "==============================================================================" - - println "Testing the Storm topology" - println "" - println "Getting one random record from the Bullet topology..." - curl -s -X POST -d '{"id":"", "content":"{}"}' http://localhost:3774/drpc/bullet-query println "Done!" } @@ -138,16 +129,16 @@ launch_bullet_web_service() { println "Downloading Bullet Web Service ${BULLET_WS_VERSION}..." download "http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/${BULLET_WS_VERSION}" "${BULLET_WS_JAR}" - println "Configuring Bullet Web Service and plugging in Storm DRPC PubSub..." + println "Configuring Bullet Web Service and plugging in In-Memory REST PubSub..." 
cp "${BULLET_DOWNLOADS}/${BULLET_WS_JAR}" "${BULLET_SERVICE_HOME}/bullet-service.jar" - cp "${BULLET_EXAMPLES}/storm"/*jar-with-dependencies.jar "${BULLET_SERVICE_HOME}/bullet-storm-jar-with-dependencies.jar" cp "${BULLET_EXAMPLES}/web-service/"example_* "${BULLET_SERVICE_HOME}/" - println "Launching Bullet Web Service..." + println "Launching Bullet Web Service with the built-in REST PubSub enabled..." cd "${BULLET_SERVICE_HOME}" - java -Dloader.path=bullet-storm-jar-with-dependencies.jar -jar bullet-service.jar \ - --bullet.pubsub.config=example_drpc_pubsub_config.yaml --bullet.schema.file=example_columns.json \ - --server.port=9999 --logging.path="${BULLET_SERVICE_HOME}" --logging.file=log.txt &> "${BULLET_SERVICE_HOME}/log.txt" & + java -jar ./bullet-service.jar \ + --bullet.pubsub.config=example_rest_pubsub_config.yaml --bullet.schema.file=example_columns.json \ + --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path="${BULLET_SERVICE_HOME}" \ + --logging.file=log.txt &> "${BULLET_SERVICE_HOME}/log.txt" & println "Sleeping for 15 s to ensure Bullet Web Service is up..." sleep 15 @@ -160,7 +151,7 @@ launch_bullet_web_service() { println "Getting column schema from the Web Service..." println "" curl -s http://localhost:9999/api/bullet/columns - println "Finished Bullet Web Service test" + println "Finished Bullet Web Service test!" } install_node() { diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml index 728546c3..0530d517 100644 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml @@ -99,5 +99,15 @@ bullet.pubsub.kafka.response.topic.name: "bullet.responses" ## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml ######################################################################################################################## ######################################################################################################################## +bullet.query.aggregation.raw.max.size: 500 +bullet.query.aggregation.max.size: 1024 +bullet.query.aggregation.count.distinct.sketch.entries: 16384 +bullet.query.aggregation.group.sketch.entries: 1024 +bullet.query.aggregation.distribution.sketch.entries: 1024 +bullet.query.aggregation.distribution.max.points: 200 +bullet.query.aggregation.distribution.generated.points.rounding: 6 +bullet.query.aggregation.top.k.sketch.entries: 1024 +bullet.query.aggregation.top.k.sketch.error.type: "NFN" +bullet.result.metadata.enable: true # Factory class to get new BulletRecords. bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml b/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml deleted file mode 100644 index 396419ab..00000000 --- a/examples/spark/src/main/resources/bullet_spark_rest_settings.yaml +++ /dev/null @@ -1,104 +0,0 @@ -######################################################################################################################## -############################################### Bullet Spark defaults ################################################# -######################################################################################################################## -# This is the name of the concrete implementation of Data Producer to use. 
-bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer" - -# This is the batch interval of your Spark Streaming job. Find out more at -# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval. -bullet.spark.batch.duration.ms: 1000 - -# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark. -bullet.spark.receiver.query.block.size: 1 - -# This is the maximum number of partitions that will be created by the Query Receiver. -bullet.spark.receiver.query.coalesce.partitions: 10 - -# This is the number of Data Producers. -bullet.spark.data.producer.parallelism: 1 - -# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path. -bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint" - -# If true, Bullet Spark recovers context from checkpoint files when restarting. -# Otherwise Bullet Spark creates a new context. -bullet.spark.recover.from.checkpoint.enable: false - -# This is the Spark application name. -bullet.spark.app.name: "BulletSparkStreamingJob" - -# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json). -bullet.spark.metrics.enabled: false - -# If true, enables parallel processing of queries in each partition of the Filter Streaming job, This is particularly -# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning -# the data and instead choose to parallelize within each partition (fixed by the producer) instead. -# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering -# operation concurrently. -bullet.spark.filter.partition.parallel.mode.enabled: false - -# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.parallelism: 4 - -# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed -# costs to manage a thread pool, they are only created once the number of queries exceeds this threshold. -# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true. -bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10 - -# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation. -# Checkpoint interval = Spark duration * checkpoint duration multiplier -# Use this to control the frequency of checkpointing operation. If this is set too high, there might be too much -# data to checkpoint (RDD lineage graph). -bullet.spark.query.union.checkpoint.duration.multiplier: 10 -bullet.spark.join.checkpoint.duration.multiplier: 10 - -# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to query -# receiver. If you need to change settings for your publisher in this mode that is different from the settings -# used in the result publisher, override them here. This setting needs to be a Map if provided. -# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours. 
-# Example:
-#
-# bullet.spark.loop.pubsub.overrides:
-#   bullet.pubsub.custom.publisher.setting: 1
-#   bullet.pubsub.custom.nested.publisher.setting:
-#     foo: bar
-#     bar: baz
-bullet.spark.loop.pubsub.overrides: {}
-
-########################################################################################################################
-############################################### Spark Streaming defaults ###############################################
-########################################################################################################################
-# The following settings are passed to Spark directly. You can add more settings here.
-# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
-# Add configuration that changes infrequently here and submit more variable settings while submitting the job on the
-# command line.
-spark.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.streaming.stopGracefullyOnShutdown: "true"
-spark.streaming.receiver.writeAheadLog.enable: "false"
-spark.streaming.driver.writeAheadLog.allowBatching: "false"
-
-########################################################################################################################
-############################################### Query PubSub defaults ##################################################
-########################################################################################################################
-# This is the type of PubSub context to use for the result publisher.
-# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
-bullet.pubsub.context.name: "QUERY_PROCESSING"
-# This is the name of the concrete implementation of PubSub to use.
-# By default, it is the built-in REST in-memory PubSub.
-bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub"
-# A list of url(s) for the query endpoint. In the web service, this should contain a single URL for the query endpoint
-# of the in-memory pubsub instance running on that web service. For the backend it should contain the urls of all
-# the pubsub instances.
-bullet.pubsub.rest.query.urls:
-  - "http://localhost:9999/api/bullet/pubsub/query"
-
-########################################################################################################################
-############################################### Bullet Core settings ###################################################
-########################################################################################################################
-## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
-## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
-########################################################################################################################
-########################################################################################################################
-# Factory class to get new BulletRecords.
-bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider" diff --git a/examples/storm/bin/launch.sh b/examples/storm/bin/launch.sh index f5e7ed08..54da3114 100755 --- a/examples/storm/bin/launch.sh +++ b/examples/storm/bin/launch.sh @@ -3,7 +3,7 @@ # We pass 20 and 100 to the RandomSpout, which means it generates up to 20 random records every 100 ms. storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ com.yahoo.bullet.storm.Topology \ - --bullet-conf bullet_settings.yaml \ + --bullet-conf ./bullet_settings.yaml \ --bullet-spout com.yahoo.bullet.storm.examples.RandomSpout \ --bullet-spout-parallelism 1 \ --bullet-spout-cpu-load 100.0 \ diff --git a/examples/storm/pom.xml b/examples/storm/pom.xml index 69a1a84e..50a16050 100644 --- a/examples/storm/pom.xml +++ b/examples/storm/pom.xml @@ -7,7 +7,7 @@ jar bullet-storm-example - scm:git:ssh://git@github.com/yahoo/bullet-docs.git + scm:git:ssh://git@github.com/bullet-db/bullet-db.github.io.git HEAD @@ -26,7 +26,7 @@ UTF-8 1.8 1.8 - 0.8.2 + 0.8.3 0.4.0 0.2.0 1.0.3 diff --git a/examples/storm/src/main/resources/bullet_settings.yaml b/examples/storm/src/main/resources/bullet_settings.yaml index 0ee99385..6069c715 100644 --- a/examples/storm/src/main/resources/bullet_settings.yaml +++ b/examples/storm/src/main/resources/bullet_settings.yaml @@ -1,29 +1,36 @@ -# Custom settings for the example +# Custom or notable settings for the example # Settings not overridden will default to the defaults in bullet_storm_defaults.yaml in the bullet-storm artifact # and to the defaults in bullet_defaults.yaml in the bullet-core artifact. -# Settings that start with bullet.topology are Storm settings and everything else are Bullet Core settings. + +# Storm settings bullet.topology.name: "bullet" bullet.topology.metrics.enable: false bullet.topology.metrics.built.in.enable: false bullet.topology.query.spout.parallelism: 1 bullet.topology.query.spout.cpu.load: 30.0 bullet.topology.query.spout.memory.on.heap.load: 256.0 -bullet.topology.query.spout.memory.off.heap.load: 192.0 +bullet.topology.query.spout.memory.off.heap.load: 160.0 bullet.topology.filter.bolt.parallelism: 1 bullet.topology.filter.bolt.cpu.load: 100.0 bullet.topology.filter.bolt.memory.on.heap.load: 256.0 -bullet.topology.filter.bolt.memory.off.heap.load: 192.0 +bullet.topology.filter.bolt.memory.off.heap.load: 160.0 bullet.topology.return.bolt.parallelism: 1 bullet.topology.result.bolt.cpu.load: 10.0 bullet.topology.result.bolt.memory.on.heap.load: 128.0 -bullet.topology.result.bolt.memory.off.heap.load: 192.0 +bullet.topology.result.bolt.memory.off.heap.load: 160.0 +bullet.topology.loop.bolt.parallelism: 1 +bullet.topology.loop.bolt.cpu.load: 10.0 +bullet.topology.loop.bolt.memory.on.heap.load: 128.0 +bullet.topology.loop.bolt.memory.off.heap.load: 160.0 bullet.topology.join.bolt.parallelism: 1 bullet.topology.join.bolt.cpu.load: 20.0 bullet.topology.join.bolt.memory.on.heap.load: 128.0 -bullet.topology.join.bolt.memory.off.heap.load: 192.0 -bullet.topology.join.bolt.error.tick.timeout: 3 -bullet.topology.join.bolt.query.tick.timeout: 3 -bullet.topology.tick.interval.secs: 1 +bullet.topology.join.bolt.memory.off.heap.load: 160.0 +bullet.topology.join.bolt.query.post.finish.buffer.ticks: 3 +bullet.topology.join.bolt.query.pre.start.delay.ticks: 2 +bullet.topology.tick.spout.interval.ms: 100 + +# Bullet Core settings bullet.query.aggregation.raw.max.size: 500 bullet.query.aggregation.max.size: 1024 
bullet.query.aggregation.count.distinct.sketch.entries: 16384 @@ -33,47 +40,11 @@ bullet.query.aggregation.distribution.max.points: 200 bullet.query.aggregation.distribution.generated.points.rounding: 6 bullet.query.aggregation.top.k.sketch.entries: 1024 bullet.query.aggregation.top.k.sketch.error.type: "NFN" -bullet.query.max.duration: 570000 bullet.result.metadata.enable: true -bullet.result.metadata.metrics: - - name: "Query Identifier" - key: "query_id" - - name: "Query Body" - key: "query" - - name: "Query Creation Time" - key: "query_receive_time" - - name: "Query Termination Time" - key: "query_finish_time" - - name: "Sketch Metadata" - key: "sketches" - - name: "Estimated Result" - key: "was_estimated" - - name: "Standard Deviations" - key: "standard_deviations" - - name: "Family" - key: "family" - - name: "Size" - key: "size" - - name: "Theta" - key: "theta" - - name: "Uniques Estimate" - key: "uniques_estimate" - - name: "Minimum Value" - key: "minimum_value" - - name: "Maximum Value" - key: "maximum_value" - - name: "Items Seen" - key: "items_seen" - - name: "Normalized Rank Error" - key: "normalized_rank_error" - - name: "Maximum Count Error" - key: "maximum_count_error" - - name: "Active Items" - key: "active_items" -bullet.record.inject.timestamp.enable: true -bullet.record.inject.timestamp.key: "receive_timestamp" -# Storm DRPC PubSub settings +# REST PubSub settings bullet.pubsub.context.name: "QUERY_PROCESSING" -bullet.pubsub.class.name: "com.yahoo.bullet.storm.drpc.DRPCPubSub" -bullet.pubsub.storm.drpc.function: "bullet-query" +bullet.pubsub.class.name: "com.yahoo.bullet.pubsub.rest.RESTPubSub" +bullet.pubsub.rest.query.urls: + - "http://localhost:9999/api/bullet/pubsub/query" +bullet.pubsub.rest.result.url: "http://localhost:9999/api/bullet/pubsub/result" diff --git a/examples/ui/env-settings.json b/examples/ui/env-settings.json index 2d33339c..b0b63826 100644 --- a/examples/ui/env-settings.json +++ b/examples/ui/env-settings.json @@ -19,15 +19,14 @@ "deletions": "query" }, "defaultValues": { - "aggregationMaxSize": 512, - "rawMaxSize": 100, - "durationMaxSecs": 120, + "aggregationMaxSize": 1024, + "rawMaxSize": 500, + "durationMaxSecs": 86400, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, "distributionQuantileEnd": 1, "distributionQuantileIncrement": 0.1, - "queryTimeoutSecs": 3, "windowEmitFrequencyMinSecs": 1, "everyForRecordBasedWindow": 1, "everyForTimeBasedWindow": 2, diff --git a/mkdocs.yml b/mkdocs.yml index 10d77dd3..685e2cb3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,7 +43,7 @@ markdown_extensions: extra: collapse_toc: true include_search: true - service_version: v0.5.0 + service_version: v0.5.1 extra_css: - css/extra.css