Skip to content

Commit

Permalink
Adding range based pruning to bloom index
Browse files Browse the repository at this point in the history
 - keys compared lexicographically using String::compareTo
 - Range metadata additionally written into parquet file footers
 - Trim fat & few optimizations to speed up indexing
 - Add param to control whether input shall be cached, to speed up lookup
 - Add param to turn on/off range pruning
 - Auto compute of parallelism now simply factors in amount of comparisons done
 - More accurate parallelism computation when range pruning is on
 - tests added & hardened, docs updated
  • Loading branch information
Vinoth Chandar committed Aug 4, 2017
1 parent 19c22b2 commit d7542e6
Show file tree
Hide file tree
Showing 25 changed files with 785 additions and 474 deletions.
10 changes: 8 additions & 2 deletions docs/configurations.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,15 @@ summary: "Here we list all possible configurations and what they mean"
- [withIndexType](#withIndexType) (indexType = BLOOM) <br/>
<span style="color:grey">Type of index to use. Default is Bloom filter. Possible options are [BLOOM | HBASE | INMEMORY]. Bloom filters removes the dependency on a external system and is stored in the footer of the Parquet Data Files</span>
- [bloomFilterNumEntries](#bloomFilterNumEntries) (60000) <br/>
<span style="color:grey">Only application if index type is BLOOM. <br/>This is the number of entries to be stored in the bloom filter. We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. [#70](https://github.com/uber/hoodie/issues/70) tracks computing this dynamically. Warning: Setting this very low, will generate a lot of false positives and index lookup will have to scan a lot more files than it has to and Setting this to a very high number will increase the size every data file linearly (roughly 4KB for every 50000 entries).</span>
<span style="color:grey">Only applies if index type is BLOOM. <br/>This is the number of entries to be stored in the bloom filter. We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. [#70](https://github.com/uber/hoodie/issues/70) tracks computing this dynamically. Warning: Setting this very low, will generate a lot of false positives and index lookup will have to scan a lot more files than it has to and Setting this to a very high number will increase the size every data file linearly (roughly 4KB for every 50000 entries).</span>
- [bloomFilterFPP](#bloomFilterFPP) (0.000000001) <br/>
<span style="color:grey">Only application if index type is BLOOM. <br/> Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives</span>
<span style="color:grey">Only applies if index type is BLOOM. <br/> Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives</span>
- [bloomIndexPruneByRanges](#bloomIndexPruneByRanges) (true) <br/>
<span style="color:grey">Only applies if index type is BLOOM. <br/> When true, range information from files to leveraged speed up index lookups. Particularly helpful, if the key has a monotonously increasing prefix, such as timestamp.</span>
- [bloomIndexUseCaching](#bloomIndexUseCaching) (true) <br/>
<span style="color:grey">Only applies if index type is BLOOM. <br/> When true, the input RDD will cached to speed up index lookup by reducing IO for computing parallelism or affected partitions</span>
- [bloomIndexParallelism](#bloomIndexParallelism) (0) <br/>
<span style="color:grey">Only applies if index type is BLOOM. <br/> This is the amount of parallelism for index lookup, which involves a Spark Shuffle. By default, this is auto computed based on input workload characteristics</span>
- [hbaseZkQuorum](#hbaseZkQuorum) (zkString) <br/>
<span style="color:grey">Only application if index type is HBASE. HBase ZK Quorum url to connect to.</span>
- [hbaseZkPort](#hbaseZkPort) (port) <br/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.index.HoodieBloomIndex;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;

import com.uber.hoodie.table.HoodieTable;

Expand All @@ -50,7 +50,6 @@
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges";
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";

// ***** HBase Index Configs *****
public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum";
Expand Down Expand Up @@ -112,6 +116,16 @@ public Builder bloomIndexParallelism(int parallelism) {
return this;
}

public Builder bloomIndexPruneByRanges(boolean pruneRanges) {
props.setProperty(BLOOM_INDEX_PRUNE_BY_RANGES_PROP, String.valueOf(pruneRanges));
return this;
}

public Builder bloomIndexUseCaching(boolean useCaching) {
props.setProperty(BLOOM_INDEX_USE_CACHING_PROP, String.valueOf(useCaching));
return this;
}

public Builder numBucketsPerPartition(int numBuckets) {
props.setProperty(BUCKETED_INDEX_NUM_BUCKETS_PROP, String.valueOf(numBuckets));
return this;
Expand All @@ -127,6 +141,10 @@ public HoodieIndexConfig build() {
BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP),
BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING);
// Throws IllegalArgumentException if the value set is not a known Hoodie Index Type
HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP));
return config;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,14 @@ public int getBloomIndexParallelism() {
return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM_PROP));
}

public boolean getBloomIndexPruneByRanges() {
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
}

public boolean getBloomIndexUseCaching() {
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING_PROP));
}

public int getNumBucketsPerPartition() {
return Integer.parseInt(props.getProperty(HoodieIndexConfig.BUCKETED_INDEX_NUM_BUCKETS_PROP));
}
Expand Down

0 comments on commit d7542e6

Please sign in to comment.