From 47df1d4795fe292324736d106a731de7e081030e Mon Sep 17 00:00:00 2001 From: iverase Date: Wed, 19 Jun 2019 09:30:01 +0200 Subject: [PATCH 01/19] Add new storage strategy when cardinality of points is low on a BKD tree leaf --- .../org/apache/lucene/util/bkd/BKDReader.java | 82 +++++++++++++++++-- .../org/apache/lucene/util/bkd/BKDWriter.java | 72 +++++++++++++--- 2 files changed, 136 insertions(+), 18 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index c1f5a63d7de8..4aa6fa03b30e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -441,8 +441,15 @@ int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + if (version >= BKDWriter.VERSION_LOW_CARDINALITY_LEAVES) { + visitDocValuesWithCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor); + } else { + visitDocValuesNoCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor); + } + } - + void visitDocValuesNoCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, + IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in); if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) { @@ -480,12 +487,62 @@ void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, by int compressedDim = 
readCompressedDim(in); if (compressedDim == -1) { - visitRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor); + visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor); } else { visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim); } } + void visitDocValuesWithCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, + IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + + readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in); + int compressedDim = readCompressedDim(in); + if (compressedDim == -1) { + //all values are the same + visitor.grow(count); + visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor); + } else { + if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) { + byte[] minPackedValue = scratchMinIndexPackedValue; + System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength); + byte[] maxPackedValue = scratchMaxIndexPackedValue; + //Copy common prefixes before reading adjusted + // box + System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength); + readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in); + + // The index gives us range of values for each dimension, but the actual range of values + // might be much more narrow than what the index told us, so we double check the relation + // here, which is cheap yet might help figure out that the block either entirely matches + // or does not match at all. This is especially more likely in the case that there are + // multiple dimensions that have correlation, ie. splitting on one dimension also + // significantly changes the range of values in another dimension. 
+ Relation r = visitor.compare(minPackedValue, maxPackedValue); + if (r == Relation.CELL_OUTSIDE_QUERY) { + return; + } + visitor.grow(count); + + if (r == Relation.CELL_INSIDE_QUERY) { + for (int i = 0; i < count; ++i) { + visitor.visit(docIDs[i]); + } + return; + } + } else { + visitor.grow(count); + } + if (compressedDim == -2) { + //low cardinality values + visitSparseRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor); + } else { + //high cardinality + visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim); + } + } + } + private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException { for (int dim = 0; dim < numIndexDims; dim++) { int prefix = commonPrefixLengths[dim]; @@ -495,12 +552,27 @@ private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] } // Just read suffixes for every dimension - private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { - for (int i = 0; i < count; ++i) { + private void visitSparseRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + int i; + for (i = 0; i < count;) { + int length = in.readVInt(); for(int dim=0;dim