OAK-10674: DocumentStore: verify that we could use Oak's Bloom filter #1345

Status: Draft. Wants to merge 4 commits into base branch trunk.
CacheChangesTracker.java
@@ -16,15 +16,13 @@
*/
package org.apache.jackrabbit.oak.plugins.document.cache;

- import org.apache.jackrabbit.guava.common.base.Predicate;
- import org.apache.jackrabbit.guava.common.hash.BloomFilter;
- import org.apache.jackrabbit.guava.common.hash.Funnel;
- import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
+ import org.apache.jackrabbit.oak.plugins.document.cache.tmp.BloomFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.util.List;
+ import java.util.function.Predicate;

public class CacheChangesTracker implements Closeable {

@@ -48,13 +46,13 @@ public class CacheChangesTracker {
}

public void invalidateDocument(String key) {
- if (keyFilter.apply(key)) {
+ if (keyFilter.test(key)) {
lazyBloomFilter.put(key);
}
}

public boolean mightBeenAffected(String key) {
- return keyFilter.apply(key) && lazyBloomFilter.mightContain(key);
+ return keyFilter.test(key) && lazyBloomFilter.mightContain(key);
}

@Override
@@ -65,7 +63,8 @@ public void close() {
if (lazyBloomFilter.filter == null) {
LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter);
} else {
LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter,
lazyBloomFilter.filter.expectedFpp());
}
}
}
@@ -76,36 +75,29 @@ public static class LazyBloomFilter {

private final int entries;

- private volatile BloomFilter<String> filter;
+ private volatile BloomFilter filter;

public LazyBloomFilter(int entries) {
this.entries = entries;
}

public synchronized void put(String entry) {
- getFilter().put(entry);
+ getFilter().add(entry);
}

public boolean mightContain(String entry) {
if (filter == null) {
return false;
} else {
synchronized (this) {
- return filter.mightContain(entry);
+ return filter.mayContain(entry);
}
}
}

- private BloomFilter<String> getFilter() {
+ private BloomFilter getFilter() {
if (filter == null) {
- filter = BloomFilter.create(new Funnel<String>() {
- private static final long serialVersionUID = -7114267990225941161L;
-
- @Override
- public void funnel(String from, PrimitiveSink into) {
- into.putUnencodedChars(from);
- }
- }, entries, FPP);
+ filter = BloomFilter.construct(entries, FPP);
}
return filter;
}
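For reference, a minimal usage sketch (hypothetical, not part of this patch) of LazyBloomFilter after the switch: the Guava BloomFilter.put/mightContain calls now delegate to add/mayContain on the copied filter, and the key predicate moves from Guava's Predicate.apply to java.util.function.Predicate.test.

// Hypothetical usage sketch, not part of the patch: exercising LazyBloomFilter
// after the switch from the Guava BloomFilter to the copied tmp.BloomFilter.
CacheChangesTracker.LazyBloomFilter lazy = new CacheChangesTracker.LazyBloomFilter(1000);
lazy.put("1:/some/cached/key");                        // delegates to BloomFilter.add(Object)
boolean hit = lazy.mightContain("1:/some/cached/key"); // delegates to BloomFilter.mayContain(Object)
boolean miss = lazy.mightContain("1:/other/key");      // normally false; may be a rare false positive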
BloomFilter.java (new file)
@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.document.cache.tmp;

import org.jetbrains.annotations.NotNull;

/**
* A Bloom filter implementation.
* <p>
* Copied for the purpose of testing in oak-store-document; see OAK-10674.
*/
public class BloomFilter {

private final int k;
private final int arraySize;
private final long[] data;

private BloomFilter(long[] data, int k) {
this.data = data;
this.k = k;
this.arraySize = data.length;
}

/**
* Construct a Bloom filter. With a fpp of 0.01, the memory usage is roughly 1
* byte per entry.
*
* @param n the expected number of entries (eg. 1_000_000)
* @param fpp the false-positive probability (eg. 0.01 for a 1% false-positive
* probability)
* @return the Bloom filter
*/
public static BloomFilter construct(long n, double fpp) {
long m = calculateBits(n, fpp);
int k = calculateK((double) m / n);
return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
}

// See also https://hur.st/bloomfilter

/**
* Calculate the best k parameter for a Bloom filter.
*
* @param bitsPerKey the number of bits per key (eg. 10)
* @return the k parameter
*/
public static int calculateK(double bitsPerKey) {
return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2)));
}

/**
* Calculate the number of bits needed for a Bloom filter, given the number of entries and the target false positive probability.
*
* @param n the number of entries (eg. 1_000_000)
* @param fpp the false positive probability (eg. 0.01)
* @return the bits needed
*/
public static long calculateBits(long n, double fpp) {
return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
}

/**
* Calculate the maximum number of entries in the set, given the memory size
* in bits, and a target false positive probability.
*
* @param bits the number of bits (eg. 10_000_000)
* @param fpp the false positive probability (eg. 0.01)
* @return the maximum number of entries to be added
*/
public static long calculateN(long bits, double fpp) {
return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) / Math.log(fpp)));
}

/**
* Calculate the false positive probability.
*
* @param n the number of entries (eg. 1_000_000)
* @param bits the number of bits (eg. 10_000_000)
* @param k the number of hash functions per entry
* @return the false positive probability
*/
public static double calculateFpp(long n, long bits, int k) {
// p = pow(1 - exp(-k / (m / n)), k)
return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
}

/**
* Get the expected false positive rate for the current entries in the
* filter. This will first calculate the estimated entry count, and then
* calculate the false positive probability from there.
*/
public double expectedFpp() {
return calculateFpp(getEstimatedEntryCount(), getBitCount(), getK());
}

/**
* Add an entry.
*
* @param hash the hash value (needs to be a high-quality hash code, with all
* bits having high entropy)
*/
public void add(long hash) {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
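// Double hashing (in the style of Kirsch-Mitzenmacher): the k probe positions
// are all derived from the single 64-bit hash. The upper 32 bits of 'a' select
// the word via Hash.reduce, the low 6 bits select the bit within that word
// (via 1L << a), and 'a' advances by 'b' for each of the k probes.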
for (int i = 0; i < k; i++) {
data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
a += b;
}
}

/**
* Whether the entry may be in the set.
*
* @param hash the hash value (needs to be a high-quality hash code, with all
* bits having high entropy)
* @return true if the entry was added, or, with a certain false positive
* probability, even if it was not added
*/
public boolean mayContain(long hash) {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) {
return false;
}
a += b;
}
return true;
}

/**
* Get the size of the bit array, in bits.
*
* @return the number of bits
*/
public long getBitCount() {
return data.length * 64L;
}

public int getK() {
return k;
}

/**
* Get the estimated entry count (number of distinct items added). This
* operation is relatively slow, as it loops over all the entries.
*
* @return the estimated entry count, or Long.MAX_VALUE if the number cannot be estimated.
*/
public long getEstimatedEntryCount() {
long x = 0;
for (long d : data) {
x += Long.bitCount(d);
}
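// Standard fill-ratio estimate: n ≈ -(m / k) * ln(1 - X / m),
// where X is the number of set bits and m is the total number of bits.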
double m = getBitCount();
return (long) (-(m / k) * Math.log(1 - (x / m)));
}

/**
* Add an entry. This internally uses the hashCode() method to derive a
* high-quality hash code.
*
* @param obj the object (must not be null)
*/
public void add(@NotNull Object obj) {
add(Hash.hash64(obj.hashCode()));
}

/**
* Whether the entry may be in the set. This internally uses the hashCode()
* method to derive a high-quality hash code.
*
* @param obj the object (must not be null)
* @return true if the entry was added, or, with a certain false positive
* probability, even if it was not added
*/
public boolean mayContain(@NotNull Object obj) {
return mayContain(Hash.hash64(obj.hashCode()));
}
}
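As a rough check of the sizing math above (a hypothetical snippet, not part of the patch): for 1,000,000 entries at fpp = 0.01, calculateBits returns about 9.59 million bits, i.e. roughly 1.2 bytes per entry, and calculateK rounds to 7 probes, which matches the "roughly 1 byte per entry" note on construct.

// Hypothetical sizing sketch, not part of the patch.
long n = 1_000_000;
double fpp = 0.01;
long bits = BloomFilter.calculateBits(n, fpp);      // ~9_585_059 bits, ~1.2 bytes per entry
int k = BloomFilter.calculateK((double) bits / n);  // 7 hash probes per entry

BloomFilter filter = BloomFilter.construct(n, fpp);
for (long i = 0; i < n; i++) {
    filter.add("key-" + i);
}
// For a filter sized this way, expectedFpp() should come out near the requested
// 0.01 and getEstimatedEntryCount() near n.
System.out.println(filter.expectedFpp());
System.out.println(filter.getEstimatedEntryCount());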
Hash.java (new file)
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.document.cache.tmp;

/**
* A hash function utility class.
* <p>
* Copied for the purpose of testing in oak-store-document; see OAK-10674.
*/
public class Hash {

private Hash() {
// utility class
}

/**
* Calculate a 64-bit hash value from a value, using a seed.
*
* The current algorithm uses the finalizer of the MurmurHash3 hash function,
* but callers shouldn't rely on that.
*
* @param x the value
* @param seed the seed
* @return the hash value
*/
public static long hash64(long x, long seed) {
x += seed;
x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
x = x ^ (x >>> 33);
return x;
}

/**
* Calculate a 64-bit hash value from a value. The input is a 64-bit value and
* the output is a 64-bit value. Two different inputs are never mapped to the
* same output. The operation is reversible.
*
* @param x the value
* @return the hash value
*/
public static long hash64(long x) {
return hash64(x, 100);
}

/**
* Shrink the hash to a value in the range 0..n-1. Similar to modulo, but using
* multiplication and shift, which are faster to compute.
*
* @param hash the hash
* @param n the exclusive upper bound of the result
* @return the reduced value
*/
public static int reduce(int hash, int n) {
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
}

}
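A small sketch (hypothetical, not part of the patch) of how the two helpers combine: hash64 spreads a possibly low-entropy input over 64 well-mixed bits, and reduce then maps 32 of those bits onto an array index using a multiply-and-shift instead of a modulo.

// Hypothetical sketch, not part of the patch.
long h = Hash.hash64(42L);                        // mix the input into 64 well-distributed bits
int index = Hash.reduce((int) (h >>> 32), 1024);  // index in the range 0..1023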