Adding range based pruning to bloom index

- keys compared lexicographically using String::compareTo - Range metadata additionally written into parquet file footers - Trim fat & few optimizations to speed up indexing - Add param to control whether input shall be cached, to speed up lookup - Add param to turn on/off range pruning - Auto compute of parallelism now simply factors in amount of comparisons done - More accurate parallelism computation when range pruning is on - tests added & hardened, docs updated
apache · Aug 4, 2017 · d7542e6 · d7542e6
1 parent 19c22b2
commit d7542e6
Show file tree

Hide file tree

Showing 25 changed files with 785 additions and 474 deletions.
diff --git a/docs/configurations.md b/docs/configurations.md
@@ -29,9 +29,15 @@ summary: "Here we list all possible configurations and what they mean"
         - [withIndexType](#withIndexType) (indexType = BLOOM) <br/>
         <span style="color:grey">Type of index to use. Default is Bloom filter. Possible options are [BLOOM | HBASE | INMEMORY]. Bloom filters removes the dependency on a external system and is stored in the footer of the Parquet Data Files</span>
         - [bloomFilterNumEntries](#bloomFilterNumEntries) (60000) <br/>
-        <span style="color:grey">Only application if index type is BLOOM. <br/>This is the number of entries to be stored in the bloom filter. We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. [#70](https://github.com/uber/hoodie/issues/70) tracks computing this dynamically. Warning: Setting this very low, will generate a lot of false positives and index lookup will have to scan a lot more files than it has to and Setting this to a very high number will increase the size every data file linearly (roughly 4KB for every 50000 entries).</span>
+        <span style="color:grey">Only applies if index type is BLOOM. <br/>This is the number of entries to be stored in the bloom filter. We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. [#70](https://github.com/uber/hoodie/issues/70) tracks computing this dynamically. Warning: Setting this very low, will generate a lot of false positives and index lookup will have to scan a lot more files than it has to and Setting this to a very high number will increase the size every data file linearly (roughly 4KB for every 50000 entries).</span>
         - [bloomFilterFPP](#bloomFilterFPP) (0.000000001) <br/>
-        <span style="color:grey">Only application if index type is BLOOM. <br/> Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives</span>
+        <span style="color:grey">Only applies if index type is BLOOM. <br/> Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives</span>
+        - [bloomIndexPruneByRanges](#bloomIndexPruneByRanges) (true) <br/>
+        <span style="color:grey">Only applies if index type is BLOOM. <br/> When true, range information from files to leveraged speed up index lookups. Particularly helpful, if the key has a monotonously increasing prefix, such as timestamp.</span>
+        - [bloomIndexUseCaching](#bloomIndexUseCaching) (true) <br/>
+        <span style="color:grey">Only applies if index type is BLOOM. <br/> When true, the input RDD will cached to speed up index lookup by reducing IO for computing parallelism or affected partitions</span>
+        - [bloomIndexParallelism](#bloomIndexParallelism) (0) <br/>
+        <span style="color:grey">Only applies if index type is BLOOM. <br/> This is the amount of parallelism for index lookup, which involves a Spark Shuffle. By default, this is auto computed based on input workload characteristics</span>
         - [hbaseZkQuorum](#hbaseZkQuorum) (zkString) <br/>
         <span style="color:grey">Only application if index type is HBASE. HBase ZK Quorum url to connect to.</span>
         - [hbaseZkPort](#hbaseZkPort) (port) <br/>

diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java
@@ -30,7 +30,7 @@
 import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.exception.HoodieException;
-import com.uber.hoodie.index.HoodieBloomIndex;
+import com.uber.hoodie.index.bloom.HoodieBloomIndex;
 
 import com.uber.hoodie.table.HoodieTable;
 
@@ -50,7 +50,6 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;

diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java
@@ -42,6 +42,10 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
     public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
     // Disable explicit bloom index parallelism setting by default - hoodie auto computes
     public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
+    public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges";
+    public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
+    public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
+    public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
 
     // ***** HBase Index Configs *****
     public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum";
@@ -112,6 +116,16 @@ public Builder bloomIndexParallelism(int parallelism) {
             return this;
         }
 
+        public Builder bloomIndexPruneByRanges(boolean pruneRanges) {
+            props.setProperty(BLOOM_INDEX_PRUNE_BY_RANGES_PROP, String.valueOf(pruneRanges));
+            return this;
+        }
+
+        public Builder bloomIndexUseCaching(boolean useCaching) {
+            props.setProperty(BLOOM_INDEX_USE_CACHING_PROP, String.valueOf(useCaching));
+            return this;
+        }
+
         public Builder numBucketsPerPartition(int numBuckets) {
             props.setProperty(BUCKETED_INDEX_NUM_BUCKETS_PROP, String.valueOf(numBuckets));
             return this;
@@ -127,6 +141,10 @@ public HoodieIndexConfig build() {
                 BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
             setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
                     BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
+            setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
+                    BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
+            setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP),
+                    BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING);
             // Throws IllegalArgumentException if the value set is not a known Hoodie Index Type
             HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP));
             return config;

diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java
@@ -203,6 +203,14 @@ public int getBloomIndexParallelism() {
         return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM_PROP));
     }
 
+    public boolean getBloomIndexPruneByRanges() {
+        return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
+    }
+
+    public boolean getBloomIndexUseCaching() {
+        return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING_PROP));
+    }
+
     public int getNumBucketsPerPartition() {
         return Integer.parseInt(props.getProperty(HoodieIndexConfig.BUCKETED_INDEX_NUM_BUCKETS_PROP));
     }