Skip to content

Commit

Permalink
Reduce bloom filter size by using the optimal count for hash function…
Browse files Browse the repository at this point in the history
…s. (#11900)
  • Loading branch information
jfboeuf committed Feb 1, 2023
1 parent f9cb6a3 commit 5acca82
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 175 deletions.
5 changes: 4 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,10 @@ Improvements

Optimizations
---------------------
(No changes)

* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions
in order to achieve the same false positive probability with less memory.
(Jean-François Boeuf)

Bug Fixes
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@ public abstract class BloomFilterFactory {
* @return null or a hopefully more densely packed, smaller bitset
*/
public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
// Aim for a bitset size that would have 10% of bits set (so 90% of searches
// would fail-fast)
float targetMaxSaturation = 0.1f;
// The saturation ceiling is read from the set itself rather than fixed here;
// fieldInfo is not consulted by this implementation.
float targetMaxSaturation = initialSet.getTargetMaxSaturation();
return initialSet.downsize(targetMaxSaturation);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
*
* <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
* per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
* ~8mb bitset and hashes values using {@link MurmurHash2}. This should be suitable for most
* ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
* purposes.
*
* <p>The format of the blm file is as follows:
Expand Down Expand Up @@ -83,8 +83,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
/** Extension of Bloom Filters file */
static final String BLOOM_EXTENSION = "blm";

BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
private PostingsFormat delegatePostingsFormat;
private final BloomFilterFactory bloomFilterFactory;
private final PostingsFormat delegatePostingsFormat;

/**
* Creates Bloom filters for a selection of fields created in the index. This is recorded as a set
Expand Down Expand Up @@ -120,7 +120,7 @@ public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
// Used only by core Lucene at read-time via Service Provider instantiation -
// do not use at Write-time in application code.
public BloomFilteringPostingsFormat() {
super(BLOOM_CODEC_NAME);
// Chains to another constructor of this class, passing no delegate (null) and a
// DefaultBloomFilterFactory.
this(null, new DefaultBloomFilterFactory());
}

@Override
Expand Down Expand Up @@ -366,6 +366,11 @@ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
public ImpactsEnum impacts(int flags) throws IOException {
  // Pure pass-through: the same flags are handed to whatever delegate() returns.
  final ImpactsEnum delegated = delegate().impacts(flags);
  return delegated;
}

@Override
public String toString() {
  // Rendered as SimpleClassName(filter=<the filter's own toString()>).
  final String owner = getClass().getSimpleName();
  return owner + "(filter=" + filter.toString() + ")";
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public class DefaultBloomFilterFactory extends BloomFilterFactory {
public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
// Assume all of the docs have a unique term (e.g. a primary key), so the segment's maxDoc() is
// passed as the expected number of unique values. The second argument differs per factory
// method: createSetBasedOnQuality takes a desired max saturation (0.10f), whereas
// createOptimalSet takes a target max false-positive probability (0.1023f).
return FuzzySet.createSetBasedOnQuality(state.segmentInfo.maxDoc(), 0.10f);
return FuzzySet.createOptimalSet(state.segmentInfo.maxDoc(), 0.1023f);
}

@Override
Expand Down
129 changes: 65 additions & 64 deletions lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,6 @@
*/
public class FuzzySet implements Accountable {

public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI
public static final int VERSION_START = VERSION_SPI;
public static final int VERSION_CURRENT = 2;

/**
 * Returns the hash function associated with a serialized format version.
 *
 * @param version the format version to resolve
 * @return the hash function for that version
 * @throws IllegalArgumentException if {@code version} is below {@code VERSION_START} or above
 *     {@code VERSION_CURRENT}
 */
public static HashFunction hashFunctionForVersion(int version) {
  // Reject anything outside [VERSION_START, VERSION_CURRENT] before resolving.
  final boolean tooOld = version < VERSION_START;
  if (tooOld) {
    throw new IllegalArgumentException(
        "Version " + version + " is too old, expected at least " + VERSION_START);
  }
  final boolean tooNew = version > VERSION_CURRENT;
  if (tooNew) {
    throw new IllegalArgumentException(
        "Version " + version + " is too new, expected at most " + VERSION_CURRENT);
  }
  // Every accepted version resolves to the same implementation.
  return MurmurHash2.INSTANCE;
}

/**
* Result from {@link FuzzySet#contains(BytesRef)}: can never return definitively YES (always
* MAYBE), but can sometimes definitely return NO.
Expand All @@ -71,6 +56,7 @@ public enum ContainsResult {
private HashFunction hashFunction;
private FixedBitSet filter;
private int bloomSize;
private final int hashCount;

// The sizes of BitSet used are all numbers that, when expressed in binary form,
// are all ones. This is to enable fast downsizing from one bitset to another
Expand All @@ -82,12 +68,9 @@ public enum ContainsResult {
static final int[] usableBitSetSizes;

static {
usableBitSetSizes = new int[30];
int mask = 1;
int size = mask;
usableBitSetSizes = new int[26];
for (int i = 0; i < usableBitSetSizes.length; i++) {
// Shift in one more low-order 1 bit so the value stays all-ones in binary.
size = (size << 1) | mask;
usableBitSetSizes[i] = size;
// Entry i is 2^(i + 6) - 1, i.e. the low (i + 6) bits are all set.
usableBitSetSizes[i] = (1 << (i + 6)) - 1;
}
}

Expand Down Expand Up @@ -131,48 +114,60 @@ public static int getNearestSetSize(int maxNumberOfValuesExpected, float desired

public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) {
int setSize = getNearestSetSize(maxNumBytes);
return new FuzzySet(
new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
// The bitset is constructed with setSize + 1; the last argument is the hash count (1).
return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
}

public static FuzzySet createSetBasedOnQuality(
int maxNumUniqueValues, float desiredMaxSaturation) {
int maxNumUniqueValues, float desiredMaxSaturation, int version) {
// NOTE(review): the version parameter is not read anywhere in the body shown here.
int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
return new FuzzySet(
new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
// The last argument is the hash count: sets built this way use a single hash.
return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
}

/**
 * Creates a set sized for a target false-positive probability, using more than one hash function
 * where that is optimal.
 *
 * <p>The requested bit count is derived from {@code maxNumUniqueValues} and {@code targetMaxFpp},
 * doubled, and then mapped to a usable size by {@code getNearestSetSize}. The hash count is
 * {@code round((setSize / maxNumUniqueValues) * ln 2)}, floored at 1 so that the resulting set
 * always records and probes at least one bit per value.
 *
 * @param maxNumUniqueValues expected number of distinct values that will be added
 * @param targetMaxFpp target upper bound for the false-positive probability
 * @return a newly allocated set
 */
public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) {
  // log(1 / 2^ln2) equals -(ln 2)^2, so this is m = -n * ln(p) / (ln 2)^2.
  int setSize =
      (int)
          Math.ceil(
              (maxNumUniqueValues * Math.log(targetMaxFpp))
                  / Math.log(1 / Math.pow(2, Math.log(2))));
  // NOTE(review): 2 * setSize is int arithmetic and can wrap for very large
  // maxNumUniqueValues - confirm the upstream bound on segment size.
  setSize = getNearestSetSize(2 * setSize);
  // With maxNumUniqueValues <= 0 the ratio below is infinite or negative, and with a very small
  // ratio it rounds to 0; either way the hash count would be <= 0 and the add/contains loops
  // would never execute. Fall back to a single hash in those cases.
  int optimalK = 1;
  if (maxNumUniqueValues > 0) {
    long rounded = Math.round(((double) setSize / maxNumUniqueValues) * Math.log(2));
    optimalK = Math.max(1, (int) rounded);
  }
  return new FuzzySet(new FixedBitSet(setSize + 1), setSize, optimalK);
}

private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) {
super();
this.filter = filter;
this.bloomSize = bloomSize;
this.hashFunction = hashFunction;
// The hash implementation is fixed to MurmurHash64 here; callers only choose how many
// hashes are derived from it (hashCount).
this.hashFunction = MurmurHash64.INSTANCE;
this.hashCount = hashCount;
}

/**
* The main method required for a Bloom filter which, given a value determines set membership.
* Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
* Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. Hash
* generation follows the same principles as {@link #addValue(BytesRef)}
*
* @return NO or MAYBE
*/
public ContainsResult contains(BytesRef value) {
int hash = hashFunction.hash(value);
if (hash < 0) {
hash = hash * -1;
long hash = hashFunction.hash(value);
// Split the 64-bit hash into its upper and lower 32-bit halves.
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
for (int i = 0; i < hashCount; i++) {
// i-th derived hash is lsb + i * msb (int arithmetic, may wrap); the bitmask with
// bloomSize is applied inside mayContainValue.
int bloomPos = (lsb + i * msb);
if (!mayContainValue(bloomPos)) {
// A single unset bit is enough to rule the value out.
return ContainsResult.NO;
}
}
return mayContainValue(hash);
// Every probed bit was set, so the value may be present.
return ContainsResult.MAYBE;
}

/**
* Serializes the data set to file using the following format:
*
* <ul>
* <li>FuzzySet --&gt;FuzzySetVersion,HashFunctionName,BloomSize,
* NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
* <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The name of a
* ServiceProvider registered {@link HashFunction}
* <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the
* {@link FuzzySet} class
* <li>FuzzySet --&gt;hashCount,BloomSize, NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
* <li>hashCount --&gt; {@link DataOutput#writeVInt VInt} The number of hash functions (k).
* <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used to project
* hashes into the field's Bitset
* <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of longs (as returned
Expand All @@ -185,7 +180,7 @@ public ContainsResult contains(BytesRef value) {
* @throws IOException If there is a low-level I/O error
*/
public void serialize(DataOutput out) throws IOException {
out.writeInt(VERSION_CURRENT);
out.writeVInt(hashCount);
out.writeInt(bloomSize);
long[] bits = filter.getBits();
out.writeInt(bits.length);
Expand All @@ -197,48 +192,41 @@ public void serialize(DataOutput out) throws IOException {
}

public static FuzzySet deserialize(DataInput in) throws IOException {
int version = in.readInt();
if (version == VERSION_SPI) {
in.readString();
}
final HashFunction hashFunction = hashFunctionForVersion(version);
// Fields are read in the order serialize writes them: hashCount (VInt), bloomSize, then the
// number of longs backing the bitset.
// NOTE(review): hashCount is used exactly as read; no range check is applied here.
int hashCount = in.readVInt();
int bloomSize = in.readInt();
int numLongs = in.readInt();
long[] longs = new long[numLongs];
for (int i = 0; i < numLongs; i++) {
longs[i] = in.readLong();
}
// The bitset is rebuilt over the words read above with bloomSize + 1 as its size argument.
FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
return new FuzzySet(bits, bloomSize, hashFunction);
return new FuzzySet(bits, bloomSize, hashCount);
}

private ContainsResult mayContainValue(int positiveHash) {
assert positiveHash >= 0;
private boolean mayContainValue(int aHash) {
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
int pos = positiveHash & bloomSize;
if (filter.get(pos)) {
// This term may be recorded in this index (but could be a collision)
return ContainsResult.MAYBE;
}
// definitely NOT in this segment
return ContainsResult.NO;
// aHash is masked as-is, with no sign check; the result is true when the bit at the masked
// position is set.
int pos = aHash & bloomSize;
return filter.get(pos);
}

/**
* Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
* chosen size of the internal bitset.
* Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
* two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
* https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf). Finally, each generated hash
* is modulo n'd where n is the chosen size of the internal bitset.
*
* @param value the key value to be hashed
* @throws IOException If there is a low-level I/O error
*/
public void addValue(BytesRef value) throws IOException {
int hash = hashFunction.hash(value);
if (hash < 0) {
hash = hash * -1;
long hash = hashFunction.hash(value);
// Split the 64-bit hash into its upper and lower 32-bit halves.
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
for (int i = 0; i < hashCount; i++) {
// Bitmasking using bloomSize is effectively a modulo operation.
// The i-th derived hash is lsb + i * msb (int arithmetic, may wrap before masking).
int bloomPos = (lsb + i * msb) & bloomSize;
filter.set(bloomPos);
}
// Bitmasking using bloomSize is effectively a modulo operation.
int bloomPos = hash & bloomSize;
filter.set(bloomPos);
}

/**
Expand Down Expand Up @@ -279,7 +267,7 @@ public FuzzySet downsize(float targetMaxSaturation) {
} else {
return null;
}
return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashFunction);
return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashCount);
}

public int getEstimatedUniqueValues() {
Expand All @@ -297,6 +285,10 @@ public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
return (int) (setSizeAsDouble * logInverseSaturation);
}

/**
 * Returns the saturation ceiling this set asks to be downsized against.
 *
 * @return the constant {@code 0.5f}
 */
public float getTargetMaxSaturation() {
  final float saturationCeiling = 0.5f;
  return saturationCeiling;
}

public float getSaturation() {
int numBitsSet = filter.cardinality();
return (float) numBitsSet / (float) bloomSize;
Expand All @@ -309,6 +301,15 @@ public long ramBytesUsed() {

@Override
public String toString() {
return getClass().getSimpleName() + "(hash=" + hashFunction + ")";
return getClass().getSimpleName()
+ "(hash="
+ hashFunction
+ ", k="
+ hashCount
+ ", bits="
+ filter.cardinality()
+ "/"
+ filter.length()
+ ")";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ public abstract class HashFunction {
* @param bytes the data to be hashed
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
*/
public abstract int hash(BytesRef bytes);
public abstract long hash(BytesRef bytes);
}
Loading

0 comments on commit 5acca82

Please sign in to comment.