From 5acca826330b2afc15b6a2d5e2134d7d1a64e4a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20BOEUF?= Date: Wed, 1 Feb 2023 14:35:50 +0100 Subject: [PATCH] Reduce bloom filter size by using the optimal count for hash functions. (#11900) --- lucene/CHANGES.txt | 5 +- .../codecs/bloom/BloomFilterFactory.java | 4 +- .../bloom/BloomFilteringPostingsFormat.java | 13 +- .../bloom/DefaultBloomFilterFactory.java | 2 +- .../apache/lucene/codecs/bloom/FuzzySet.java | 129 +++++++++--------- .../lucene/codecs/bloom/HashFunction.java | 2 +- .../lucene/codecs/bloom/MurmurHash2.java | 101 -------------- .../lucene/codecs/bloom/MurmurHash64.java | 85 ++++++++++++ 8 files changed, 166 insertions(+), 175 deletions(-) delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1094842a970..c5d96cbc6fe 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -116,7 +116,10 @@ Improvements Optimizations --------------------- -(No changes) + +* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions + in order to achieve the same false positive probability with less memory. 
+ (Jean-François Boeuf) Bug Fixes --------------------- diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java index 69f31f19585..910f59a5ab9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java @@ -42,9 +42,7 @@ public abstract class BloomFilterFactory { * @return null or a hopefully more densely packed, smaller bitset */ public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) { - // Aim for a bitset size that would have 10% of bits set (so 90% of searches - // would fail-fast) - float targetMaxSaturation = 0.1f; + float targetMaxSaturation = initialSet.getTargetMaxSaturation(); return initialSet.downsize(targetMaxSaturation); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java index 5578fa26b08..2c908fcabe3 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java @@ -53,7 +53,7 @@ * *

<p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a * per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a - * ~8mb bitset and hashes values using {@link MurmurHash2}. This should be suitable for most + * ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most * purposes. * *

<p>The format of the blm file is as follows: @@ -83,8 +83,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { /** Extension of Bloom Filters file */ static final String BLOOM_EXTENSION = "blm"; - BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory(); - private PostingsFormat delegatePostingsFormat; + private final BloomFilterFactory bloomFilterFactory; + private final PostingsFormat delegatePostingsFormat; /** * Creates Bloom filters for a selection of fields created in the index. This is recorded as a set @@ -120,7 +120,7 @@ public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) { // Used only by core Lucene at read-time via Service Provider instantiation - // do not use at Write-time in application code. public BloomFilteringPostingsFormat() { - super(BLOOM_CODEC_NAME); + this(null, new DefaultBloomFilterFactory()); } @Override @@ -366,6 +366,11 @@ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { public ImpactsEnum impacts(int flags) throws IOException { return delegate().impacts(flags); } + + @Override + public String toString() { + return getClass().getSimpleName() + "(filter=" + filter.toString() + ")"; + } } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java index 61e60dcbca1..306fdb9b4f9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java @@ -31,7 +31,7 @@ public class DefaultBloomFilterFactory extends BloomFilterFactory { public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) { // Assume all of the docs have a unique term (e.g.
a primary key) and we hope to maintain a set // with 10% of bits set - return FuzzySet.createSetBasedOnQuality(state.segmentInfo.maxDoc(), 0.10f); + return FuzzySet.createOptimalSet(state.segmentInfo.maxDoc(), 0.1023f); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java index 5ccb04203da..f1d2dee65c7 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java @@ -44,21 +44,6 @@ */ public class FuzzySet implements Accountable { - public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI - public static final int VERSION_START = VERSION_SPI; - public static final int VERSION_CURRENT = 2; - - public static HashFunction hashFunctionForVersion(int version) { - if (version < VERSION_START) { - throw new IllegalArgumentException( - "Version " + version + " is too old, expected at least " + VERSION_START); - } else if (version > VERSION_CURRENT) { - throw new IllegalArgumentException( - "Version " + version + " is too new, expected at most " + VERSION_CURRENT); - } - return MurmurHash2.INSTANCE; - } - /** * Result from {@link FuzzySet#contains(BytesRef)}: can never return definitively YES (always * MAYBE), but can sometimes definitely return NO. @@ -71,6 +56,7 @@ public enum ContainsResult { private HashFunction hashFunction; private FixedBitSet filter; private int bloomSize; + private final int hashCount; // The sizes of BitSet used are all numbers that, when expressed in binary form, // are all ones. 
This is to enable fast downsizing from one bitset to another @@ -82,12 +68,9 @@ public enum ContainsResult { static final int[] usableBitSetSizes; static { - usableBitSetSizes = new int[30]; - int mask = 1; - int size = mask; + usableBitSetSizes = new int[26]; for (int i = 0; i < usableBitSetSizes.length; i++) { - size = (size << 1) | mask; - usableBitSetSizes[i] = size; + usableBitSetSizes[i] = (1 << (i + 6)) - 1; } } @@ -131,48 +114,60 @@ public static int getNearestSetSize(int maxNumberOfValuesExpected, float desired public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) { int setSize = getNearestSetSize(maxNumBytes); - return new FuzzySet( - new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT)); + return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1); } public static FuzzySet createSetBasedOnQuality( - int maxNumUniqueValues, float desiredMaxSaturation) { + int maxNumUniqueValues, float desiredMaxSaturation, int version) { int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation); - return new FuzzySet( - new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT)); + return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1); + } + + public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) { + int setSize = + (int) + Math.ceil( + (maxNumUniqueValues * Math.log(targetMaxFpp)) + / Math.log(1 / Math.pow(2, Math.log(2)))); + setSize = getNearestSetSize(2 * setSize); + int optimalK = (int) Math.round(((double) setSize / maxNumUniqueValues) * Math.log(2)); + return new FuzzySet(new FixedBitSet(setSize + 1), setSize, optimalK); } - private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) { + private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) { super(); this.filter = filter; this.bloomSize = bloomSize; - this.hashFunction = hashFunction; + this.hashFunction = MurmurHash64.INSTANCE; + this.hashCount = hashCount; } /** * The 
main method required for a Bloom filter which, given a value determines set membership. - * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. + * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. Hash + * generation follows the same principles as {@link #addValue(BytesRef)} * * @return NO or MAYBE */ public ContainsResult contains(BytesRef value) { - int hash = hashFunction.hash(value); - if (hash < 0) { - hash = hash * -1; + long hash = hashFunction.hash(value); + int msb = (int) (hash >>> Integer.SIZE); + int lsb = (int) hash; + for (int i = 0; i < hashCount; i++) { + int bloomPos = (lsb + i * msb); + if (!mayContainValue(bloomPos)) { + return ContainsResult.NO; + } } - return mayContainValue(hash); + return ContainsResult.MAYBE; } /** * Serializes the data set to file using the following format: * *