Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-8868: New storing strategy for BKD tree leaves with low cardinality #730

Merged
merged 21 commits into from
Jun 26, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 77 additions & 5 deletions lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,15 @@ int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {

void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
if (version >= BKDWriter.VERSION_LOW_CARDINALITY_LEAVES) {
visitDocValuesWithCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
} else {
visitDocValuesNoCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we always call visitDocValuesWithCardinality? It seems to include the version check already?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually we cannot because in the previous version of the BKD tree the compressed dim byte was written after the leaf bounds. This is the reason I have to fork the version here.

}


void visitDocValuesNoCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);

if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
Expand Down Expand Up @@ -480,12 +487,62 @@ void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, by
int compressedDim = readCompressedDim(in);

if (compressedDim == -1) {
visitRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
} else {
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
}
}

void visitDocValuesWithCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {

readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);
int compressedDim = readCompressedDim(in);
if (compressedDim == -1) {
//all values are the same
visitor.grow(count);
visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
} else {
if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the version check shouldn't be necessary since this method is only called when version gte VERSION_LOW_CARDINALITY_LEAVES?

byte[] minPackedValue = scratchMinIndexPackedValue;
System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength);
byte[] maxPackedValue = scratchMaxIndexPackedValue;
//Copy common prefixes before reading adjusted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you leave a space between the slashes and the text like we usually do in the rest of the codebase?

// box
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move end of comment to the previous line?

System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);

// The index gives us range of values for each dimension, but the actual range of values
// might be much more narrow than what the index told us, so we double check the relation
// here, which is cheap yet might help figure out that the block either entirely matches
// or does not match at all. This is especially more likely in the case that there are
// multiple dimensions that have correlation, ie. splitting on one dimension also
// significantly changes the range of values in another dimension.
Relation r = visitor.compare(minPackedValue, maxPackedValue);
if (r == Relation.CELL_OUTSIDE_QUERY) {
return;
}
visitor.grow(count);

if (r == Relation.CELL_INSIDE_QUERY) {
for (int i = 0; i < count; ++i) {
visitor.visit(docIDs[i]);
}
return;
}
} else {
visitor.grow(count);
}
if (compressedDim == -2) {
//low cardinality values
visitSparseRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
} else {
//high cardinality
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
}
}
}

private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException {
for (int dim = 0; dim < numIndexDims; dim++) {
int prefix = commonPrefixLengths[dim];
Expand All @@ -495,12 +552,27 @@ private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[]
}

// Just read suffixes for every dimension
private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; ++i) {
private void visitSparseRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
int i;
for (i = 0; i < count;) {
int length = in.readVInt();
for(int dim=0;dim<numDataDims;dim++) {
int prefix = commonPrefixLengths[dim];
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
}
for (int j = i; j < i + length; j++) {
visitor.visit(docIDs[j], scratchPackedValue);
}
i+= length;
}
if (i != count) {
throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
}
}

// Just read suffixes for every dimension
private void visitUniqueRawDocValues(byte[] scratchPackedValue, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; i++) {
visitor.visit(docIDs[i], scratchPackedValue);
}
}
Expand Down Expand Up @@ -530,7 +602,7 @@ private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchP

private int readCompressedDim(IndexInput in) throws IOException {
int compressedDim = in.readByte();
if (compressedDim < -1 || compressedDim >= numDataDims) {
if (compressedDim < -2 || compressedDim >= numDataDims) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe fail if compressedDim is -2 and version is not gte VERSION_LOW_CARDINALITY_LEAVES?

throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
}
return compressedDim;
Expand Down
72 changes: 59 additions & 13 deletions lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ public class BKDWriter implements Closeable {
//public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_LEAF_STORES_BOUNDS = 5;
public static final int VERSION_SELECTIVE_INDEXING = 6;
public static final int VERSION_CURRENT = VERSION_SELECTIVE_INDEXING;
public static final int VERSION_LOW_CARDINALITY_LEAVES= 7;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a space before the equal sign? There are a couple other places where spaces are missing in this PR.

public static final int VERSION_CURRENT = VERSION_LOW_CARDINALITY_LEAVES;

/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
Expand Down Expand Up @@ -516,6 +517,7 @@ private class OneDimensionBKDWriter {
final int[] leafDocs = new int[maxPointsInLeafNode];
private long valueCount;
private int leafCount;
private int leafCardinality;

OneDimensionBKDWriter(IndexOutput out) {
if (numIndexDims != 1) {
Expand Down Expand Up @@ -546,6 +548,9 @@ void add(byte[] packedValue, int docID) throws IOException {
assert valueInOrder(valueCount + leafCount,
0, lastPackedValue, packedValue, 0, docID, lastDocID);

if (leafCount == 0 || Arrays.mismatch(leafValues, (leafCount - 1) * bytesPerDim, leafCount * bytesPerDim, packedValue, 0, bytesPerDim) != -1) {
leafCardinality++;
}
System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
leafDocs[leafCount] = docID;
docsSeen.set(docID);
Expand All @@ -558,7 +563,8 @@ assert valueInOrder(valueCount + leafCount,
if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
// when we write N > 1 dimensional points where we write between max/2 and max per leaf block
writeLeafBlock();
writeLeafBlock(leafCardinality);
leafCardinality = 0;
leafCount = 0;
}

Expand All @@ -567,7 +573,8 @@ assert valueInOrder(valueCount + leafCount,

public long finish() throws IOException {
if (leafCount > 0) {
writeLeafBlock();
writeLeafBlock(leafCardinality);
leafCardinality = 0;
leafCount = 0;
}

Expand All @@ -593,7 +600,7 @@ public long finish() throws IOException {
return indexFP;
}

private void writeLeafBlock() throws IOException {
private void writeLeafBlock(int leafCardinality) throws IOException {
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, packedIndexBytesLength);
Expand All @@ -613,7 +620,7 @@ private void writeLeafBlock() throws IOException {
int offset = (leafCount - 1) * packedBytesLength;
int prefix = Arrays.mismatch(leafValues, 0, bytesPerDim, leafValues, offset, offset + bytesPerDim);
if (prefix == -1) {
prefix = bytesPerDim;
prefix = bytesPerDim;
}

commonPrefixLengths[0] = prefix;
Expand All @@ -635,7 +642,7 @@ public BytesRef apply(int i) {
assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength),
ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues, leafCardinality);
scratchOut.copyTo(out);
scratchOut.reset();
}
Expand Down Expand Up @@ -1028,17 +1035,43 @@ private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int cou
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}

private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues, int leafCardinality) throws IOException {
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
if (prefixLenSum == packedBytesLength) {
// all values in this block are equal
out.writeByte((byte) -1);
} else {
} else if (leafCardinality * (packedBytesLength - prefixLenSum + 2) <= count * (packedBytesLength - prefixLenSum)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I reading it right that you are counting 2 for the vint? I think you could make it 1 instead, the reasoning being that if your vints are 2 bytes on average, then it means that your runs are very long (vints start using 2 bytes when they are greater than 127) and so the sparse encoding is an obvious win.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was too conservative.

//estimate if storing the values with cardinality is cheaper than storing all values
out.writeByte((byte) -2);
if (numIndexDims != 1) {
writeActualBounds(out, commonPrefixLengths, count, packedValues);
}
BytesRef value = packedValues.apply(0);
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
int cardinality = 1;
for (int i = 1; i < count; i++) {
value = packedValues.apply(i);
if (Arrays.mismatch(value.bytes, value.offset, value.offset + value.length, scratch1, 0, packedBytesLength) != -1) {
out.writeVInt(cardinality);
for(int j = 0; j < numDataDims; j++) {
out.writeBytes(scratch1, j * bytesPerDim + commonPrefixLengths[j], bytesPerDim - commonPrefixLengths[j]);
}
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
cardinality = 1;
} else {
cardinality++;
}
}
out.writeVInt(cardinality);
for(int i = 0; i < numDataDims; i++) {
out.writeBytes(scratch1, i * bytesPerDim + commonPrefixLengths[i], bytesPerDim - commonPrefixLengths[i]);
}
} else {
assert commonPrefixLengths[sortedDim] < bytesPerDim;
out.writeByte((byte) sortedDim);
if (numIndexDims != 1) {
writeActualBounds(out, commonPrefixLengths, count, packedValues);
}
int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
commonPrefixLengths[sortedDim]++;
for (int i = 0; i < count; ) {
Expand Down Expand Up @@ -1246,11 +1279,17 @@ private void build(int nodeID, int leafNodeOffset,
final int count = to - from;
assert count <= maxPointsInLeafNode;

// Compute common prefixes
// Compute common prefixes and cardinality
Arrays.fill(commonPrefixLengths, bytesPerDim);
reader.getValue(from, scratchBytesRef1);
int leafCardinality = 1;
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
for (int i = from + 1; i < to; ++i) {
reader.getValue(i, scratchBytesRef2);
if (Arrays.mismatch(scratch1, 0, packedBytesLength, scratchBytesRef2.bytes, scratchBytesRef2.offset, scratchBytesRef2.offset + packedBytesLength) != -1) {
leafCardinality++;
System.arraycopy(scratchBytesRef2.bytes, scratchBytesRef2.offset, scratch1, 0, packedBytesLength);
}
for (int dim=0;dim<numDataDims;dim++) {
final int offset = dim * bytesPerDim;
int dimensionPrefixLength = commonPrefixLengths[dim];
Expand Down Expand Up @@ -1323,7 +1362,7 @@ public BytesRef apply(int i) {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
scratchOut.copyTo(out);
scratchOut.reset();
} else {
Expand Down Expand Up @@ -1395,7 +1434,7 @@ private void build(int nodeID, int leafNodeOffset,
int from = Math.toIntExact(points.start);
int to = Math.toIntExact(points.start + points.count);
//we store common prefix on scratch1
computeCommonPrefixLength(heapSource, scratch1, from, to);
int leafCardinality = computeCommonPrefixLength(heapSource, scratch1, from, to);

int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
Expand Down Expand Up @@ -1459,7 +1498,7 @@ public BytesRef apply(int i) {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
heapSource.docIDs, from);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);

} else {
// Inner node: partition/recurse
Expand Down Expand Up @@ -1516,16 +1555,22 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
}
}

private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix, int from, int to) {
private int computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix, int from, int to) {
Arrays.fill(commonPrefixLengths, bytesPerDim);
PointValue value = heapPointWriter.getPackedValueSlice(from);
BytesRef packedValue = value.packedValue();
for (int dim = 0; dim < numDataDims; dim++) {
System.arraycopy(packedValue.bytes, packedValue.offset + dim * bytesPerDim, commonPrefix, dim * bytesPerDim, bytesPerDim);
}
System.arraycopy(packedValue.bytes, packedValue.offset, scratch2, 0, packedBytesLength);
int leafCardinality = 1;
for (int i = from + 1; i < to; i++) {
value = heapPointWriter.getPackedValueSlice(i);
packedValue = value.packedValue();
if (Arrays.mismatch(scratch2, 0, packedBytesLength, packedValue.bytes, packedValue.offset, packedValue.offset + packedBytesLength) != -1) {
leafCardinality++;
System.arraycopy(packedValue.bytes, packedValue.offset, scratch2, 0, packedBytesLength);
}
for (int dim = 0; dim < numDataDims; dim++) {
if (commonPrefixLengths[dim] != 0) {
int j = Arrays.mismatch(commonPrefix, dim * bytesPerDim, dim * bytesPerDim + commonPrefixLengths[dim], packedValue.bytes, packedValue.offset + dim * bytesPerDim, packedValue.offset + dim * bytesPerDim + commonPrefixLengths[dim]);
Expand All @@ -1535,6 +1580,7 @@ private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] c
}
}
}
return leafCardinality;
}

// only called from assert
Expand Down