Skip to content

Commit

Permalink
HBASE-13510 - Purge ByteBloomFilter (Ram)
Browse files Browse the repository at this point in the history
  • Loading branch information
ramkrish86 committed May 19, 2015
1 parent 901714d commit 5e7e626
Show file tree
Hide file tree
Showing 14 changed files with 760 additions and 790 deletions.
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo; import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.regionserver.TimeRangeTracker; import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
import org.apache.hadoop.hbase.util.BloomFilter; import org.apache.hadoop.hbase.util.BloomFilter;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.apache.hadoop.hbase.util.BloomFilterFactory; import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.ByteBloomFilter;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Writables; import org.apache.hadoop.hbase.util.Writables;
Expand Down Expand Up @@ -424,7 +424,7 @@ private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo)
System.out.println("Bloom filter:"); System.out.println("Bloom filter:");
if (bloomFilter != null) { if (bloomFilter != null) {
System.out.println(FOUR_SPACES + bloomFilter.toString().replaceAll( System.out.println(FOUR_SPACES + bloomFilter.toString().replaceAll(
ByteBloomFilter.STATS_RECORD_SEP, "\n" + FOUR_SPACES)); BloomFilterUtil.STATS_RECORD_SEP, "\n" + FOUR_SPACES));
} else { } else {
System.out.println(FOUR_SPACES + "Not present"); System.out.println(FOUR_SPACES + "Not present");
} }
Expand All @@ -438,7 +438,7 @@ private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo)
System.out.println("Delete Family Bloom filter:"); System.out.println("Delete Family Bloom filter:");
if (bloomFilter != null) { if (bloomFilter != null) {
System.out.println(FOUR_SPACES System.out.println(FOUR_SPACES
+ bloomFilter.toString().replaceAll(ByteBloomFilter.STATS_RECORD_SEP, + bloomFilter.toString().replaceAll(BloomFilterUtil.STATS_RECORD_SEP,
"\n" + FOUR_SPACES)); "\n" + FOUR_SPACES));
} else { } else {
System.out.println(FOUR_SPACES + "Not present"); System.out.println(FOUR_SPACES + "Not present");
Expand Down
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -707,7 +707,6 @@ public static class Writer implements Compactor.CellSink {
private final BloomType bloomType; private final BloomType bloomType;
private byte[] lastBloomKey; private byte[] lastBloomKey;
private int lastBloomKeyOffset, lastBloomKeyLen; private int lastBloomKeyOffset, lastBloomKeyLen;
private CellComparator kvComparator;
private Cell lastCell = null; private Cell lastCell = null;
private long earliestPutTs = HConstants.LATEST_TIMESTAMP; private long earliestPutTs = HConstants.LATEST_TIMESTAMP;
private Cell lastDeleteFamilyCell = null; private Cell lastDeleteFamilyCell = null;
Expand Down Expand Up @@ -754,8 +753,6 @@ private Writer(FileSystem fs, Path path,
.withFileContext(fileContext) .withFileContext(fileContext)
.create(); .create();


this.kvComparator = comparator;

generalBloomFilterWriter = BloomFilterFactory.createGeneralBloomAtWrite( generalBloomFilterWriter = BloomFilterFactory.createGeneralBloomAtWrite(
conf, cacheConf, bloomType, conf, cacheConf, bloomType,
(int) Math.min(maxKeys, Integer.MAX_VALUE), writer); (int) Math.min(maxKeys, Integer.MAX_VALUE), writer);
Expand Down Expand Up @@ -864,7 +861,9 @@ private void appendGeneralBloomfilter(final Cell cell) throws IOException {
* 1. Row = Row * 1. Row = Row
* 2. RowCol = Row + Qualifier * 2. RowCol = Row + Qualifier
*/ */
byte[] bloomKey; byte[] bloomKey = null;
// Used with ROW_COL bloom
KeyValue bloomKeyKV = null;
int bloomKeyOffset, bloomKeyLen; int bloomKeyOffset, bloomKeyLen;


switch (bloomType) { switch (bloomType) {
Expand All @@ -877,29 +876,32 @@ private void appendGeneralBloomfilter(final Cell cell) throws IOException {
// merge(row, qualifier) // merge(row, qualifier)
// TODO: could save one buffer copy in case of compound Bloom // TODO: could save one buffer copy in case of compound Bloom
// filters when this involves creating a KeyValue // filters when this involves creating a KeyValue
bloomKey = generalBloomFilterWriter.createBloomKey(cell.getRowArray(), bloomKeyKV = KeyValueUtil.createFirstOnRow(cell.getRowArray(), cell.getRowOffset(),
cell.getRowOffset(), cell.getRowLength(), cell.getQualifierArray(), cell.getRowLength(),
cell.getQualifierOffset(), cell.getQualifierLength()); HConstants.EMPTY_BYTE_ARRAY, 0, 0, cell.getQualifierArray(),
bloomKeyOffset = 0; cell.getQualifierOffset(),
bloomKeyLen = bloomKey.length; cell.getQualifierLength());
bloomKey = bloomKeyKV.getBuffer();
bloomKeyOffset = bloomKeyKV.getKeyOffset();
bloomKeyLen = bloomKeyKV.getKeyLength();
break; break;
default: default:
throw new IOException("Invalid Bloom filter type: " + bloomType + throw new IOException("Invalid Bloom filter type: " + bloomType +
" (ROW or ROWCOL expected)"); " (ROW or ROWCOL expected)");
} }
generalBloomFilterWriter.add(bloomKey, bloomKeyOffset, bloomKeyLen); generalBloomFilterWriter.add(bloomKey, bloomKeyOffset, bloomKeyLen);
if (lastBloomKey != null) { if (lastBloomKey != null) {
boolean res = false; int res = 0;
// hbase:meta does not have blooms. So we need not have special interpretation // hbase:meta does not have blooms. So we need not have special interpretation
// of the hbase:meta cells. We can safely use Bytes.BYTES_RAWCOMPARATOR for ROW Bloom // of the hbase:meta cells. We can safely use Bytes.BYTES_RAWCOMPARATOR for ROW Bloom
if (bloomType == BloomType.ROW) { if (bloomType == BloomType.ROW) {
res = Bytes.BYTES_RAWCOMPARATOR.compare(bloomKey, bloomKeyOffset, bloomKeyLen, res = Bytes.BYTES_RAWCOMPARATOR.compare(bloomKey, bloomKeyOffset, bloomKeyLen,
lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen) <= 0; lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen);
} else { } else {
res = (CellComparator.COMPARATOR.compare(lastBloomKeyOnlyKV, bloomKey, // TODO : Caching of kv components becomes important in these cases
bloomKeyOffset, bloomKeyLen) >= 0); res = CellComparator.COMPARATOR.compare(bloomKeyKV, lastBloomKeyOnlyKV);
} }
if (res) { if (res <= 0) {
throw new IOException("Non-increasing Bloom keys: " throw new IOException("Non-increasing Bloom keys: "
+ Bytes.toStringBinary(bloomKey, bloomKeyOffset, bloomKeyLen) + " after " + Bytes.toStringBinary(bloomKey, bloomKeyOffset, bloomKeyLen) + " after "
+ Bytes.toStringBinary(lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen)); + Bytes.toStringBinary(lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen));
Expand Down Expand Up @@ -1252,7 +1254,10 @@ public boolean passesGeneralBloomFilter(byte[] row, int rowOffset,
return true; return true;
} }


byte[] key; // Used in ROW bloom
byte[] key = null;
// Used in ROW_COL bloom
KeyValue kvKey = null;
switch (bloomFilterType) { switch (bloomFilterType) {
case ROW: case ROW:
if (col != null) { if (col != null) {
Expand All @@ -1267,8 +1272,9 @@ public boolean passesGeneralBloomFilter(byte[] row, int rowOffset,
break; break;


case ROWCOL: case ROWCOL:
key = bloomFilter.createBloomKey(row, rowOffset, rowLen, col, kvKey = KeyValueUtil.createFirstOnRow(row, rowOffset, rowLen,
colOffset, colLen); HConstants.EMPTY_BYTE_ARRAY, 0, 0, col, colOffset,
colLen);
break; break;


default: default:
Expand Down Expand Up @@ -1304,9 +1310,7 @@ public boolean passesGeneralBloomFilter(byte[] row, int rowOffset,
if (bloomFilterType == BloomType.ROW) { if (bloomFilterType == BloomType.ROW) {
keyIsAfterLast = (Bytes.BYTES_RAWCOMPARATOR.compare(key, lastBloomKey) > 0); keyIsAfterLast = (Bytes.BYTES_RAWCOMPARATOR.compare(key, lastBloomKey) > 0);
} else { } else {
// TODO : Convert key to Cell so that we could use compare(Cell, Cell) keyIsAfterLast = (CellComparator.COMPARATOR.compare(kvKey, lastBloomKeyOnlyKV)) > 0;
keyIsAfterLast = (CellComparator.COMPARATOR.compare(lastBloomKeyOnlyKV, key, 0,
key.length)) < 0;
} }
} }


Expand All @@ -1315,19 +1319,17 @@ public boolean passesGeneralBloomFilter(byte[] row, int rowOffset,
// columns, a file might be skipped if using row+col Bloom filter. // columns, a file might be skipped if using row+col Bloom filter.
// In order to ensure this file is included an additional check is // In order to ensure this file is included an additional check is
// required looking only for a row bloom. // required looking only for a row bloom.
byte[] rowBloomKey = bloomFilter.createBloomKey(row, rowOffset, rowLen, KeyValue rowBloomKey = KeyValueUtil.createFirstOnRow(row, rowOffset, rowLen,
null, 0, 0); HConstants.EMPTY_BYTE_ARRAY, 0, 0, HConstants.EMPTY_BYTE_ARRAY, 0, 0);
// hbase:meta does not have blooms. So we need not have special interpretation // hbase:meta does not have blooms. So we need not have special interpretation
// of the hbase:meta cells. We can safely use Bytes.BYTES_RAWCOMPARATOR for ROW Bloom // of the hbase:meta cells. We can safely use Bytes.BYTES_RAWCOMPARATOR for ROW Bloom
if (keyIsAfterLast if (keyIsAfterLast
&& (CellComparator.COMPARATOR.compare(lastBloomKeyOnlyKV, rowBloomKey, 0, && (CellComparator.COMPARATOR.compare(rowBloomKey, lastBloomKeyOnlyKV)) > 0) {
rowBloomKey.length)) < 0) {
exists = false; exists = false;
} else { } else {
exists = exists =
bloomFilter.contains(key, 0, key.length, bloom) || bloomFilter.contains(kvKey, bloom) ||
bloomFilter.contains(rowBloomKey, 0, rowBloomKey.length, bloomFilter.contains(rowBloomKey, bloom);
bloom);
} }
} else { } else {
exists = !keyIsAfterLast exists = !keyIsAfterLast
Expand Down
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -20,32 +20,75 @@


import java.nio.ByteBuffer; import java.nio.ByteBuffer;


import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceAudience;


/** /**
* Defines the general behavior of a bloom filter.
* *
* Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
* <p> * <p>
* The Bloom filter is a data structure that was introduced in 1970 and that * The Bloom filter is a data structure that was introduced in 1970 and that has
* has been adopted by the networking research community in the past decade * been adopted by the networking research community in the past decade thanks
* thanks to the bandwidth efficiencies that it offers for the transmission of * to the bandwidth efficiencies that it offers for the transmission of set
* set membership information between networked hosts. A sender encodes the * membership information between networked hosts. A sender encodes the
* information into a bit vector, the Bloom filter, that is more compact than a * information into a bit vector, the Bloom filter, that is more compact than a
* conventional representation. Computation and space costs for construction * conventional representation. Computation and space costs for construction are
* are linear in the number of elements. The receiver uses the filter to test * linear in the number of elements. The receiver uses the filter to test
* whether various elements are members of the set. Though the filter will * whether various elements are members of the set. Though the filter will
* occasionally return a false positive, it will never return a false negative. * occasionally return a false positive, it will never return a false negative.
* When creating the filter, the sender can choose its desired point in a * When creating the filter, the sender can choose its desired point in a
* trade-off between the false positive rate and the size. * trade-off between the false positive rate and the size.
* *
* <p>
* Originally inspired by <a href="http://www.one-lab.org">European Commission
* One-Lab Project 034819</a>.
*
* Bloom filters are very sensitive to the number of elements inserted into
* them. For HBase, the number of entries depends on the size of the data stored
* in the column. Currently the default region size is 256MB, so entry count ~=
* 256MB / (average value size for column). Despite this rule of thumb, there is
* no efficient way to calculate the entry count after compactions. Therefore,
* it is often easier to use a dynamic bloom filter that will add extra space
* instead of allowing the error rate to grow.
*
* (http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf)
*
* m denotes the number of bits in the Bloom filter (bitSize); n denotes the
* number of elements inserted into the Bloom filter (maxKeys); k represents the
* number of hash functions used (nbHash); e represents the desired false
* positive rate for the bloom (err).
*
* If we fix the error rate (e) and know the number of entries, then the optimal
* bloom size m = -(n * ln(err)) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
*
* The probability of false positives is minimized when k = (m/n) * ln(2).
*
* @see BloomFilter The general behavior of a filter
*
* @see <a
* href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">
* Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
*
* @see BloomFilterWriter for the ability to add elements to a Bloom filter * @see BloomFilterWriter for the ability to add elements to a Bloom filter
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
public interface BloomFilter extends BloomFilterBase { public interface BloomFilter extends BloomFilterBase {


/** /**
* Check if the specified key is contained in the bloom filter. * Check if the specified key is contained in the bloom filter.
* * Used in ROW_COL blooms where the blooms are serialized as KeyValues
* @param keyCell the key to check for the existence of
* @param bloom bloom filter data to search. This can be null if auto-loading
* is supported.
* @return true if matched by bloom, false if not
*/
boolean contains(Cell keyCell, ByteBuffer bloom);

/**
* Check if the specified key is contained in the bloom filter.
* Used in ROW bloom where the blooms are just plain byte[]
* @param buf data to check for existence of * @param buf data to check for existence of
* @param offset offset into the data * @param offset offset into the data
* @param length length of the data * @param length length of the data
Expand Down
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -41,11 +41,4 @@ public interface BloomFilterBase {
* @return Size of the bloom, in bytes * @return Size of the bloom, in bytes
*/ */
long getByteSize(); long getByteSize();

/**
* Create a key for a row-column Bloom filter.
*/
byte[] createBloomKey(byte[] rowBuf, int rowOffset, int rowLen,
byte[] qualBuf, int qualOffset, int qualLen);

} }
Loading

0 comments on commit 5e7e626

Please sign in to comment.