Skip to content

Commit

Permalink
LUCENE-9820: PointTree#size() should handle the case of balanced tree…
Browse files Browse the repository at this point in the history
… in pre-8.6 indexes (#462)

Properly handle the case where trees are fully balanced when the number of dimensions is greater than 1
  • Loading branch information
iverase committed Nov 25, 2021
1 parent 8710252 commit 800f002
Show file tree
Hide file tree
Showing 8 changed files with 2,546 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MutablePointTree;
Expand All @@ -36,8 +37,6 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.bkd.BKDConfig;
import org.apache.lucene.util.bkd.BKDWriter;

/** Writes dimensional values */
public class Lucene60PointsWriter extends PointsWriter {
Expand Down Expand Up @@ -91,37 +90,31 @@ public Lucene60PointsWriter(
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
this(
writeState,
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter60.DEFAULT_MAX_MB_SORT_IN_HEAP);
}

@Override
public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {

PointValues.PointTree values = reader.getValues(fieldInfo.name).getPointTree();

BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);

try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
values.size())) {

if (values instanceof MutablePointTree) {
Runnable finalizer =
writer.writeField(dataOut, dataOut, dataOut, fieldInfo.name, (MutablePointTree) values);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointTree) values);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
return;
}
Expand All @@ -145,10 +138,8 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
});

// We could have 0 points on merge since all docs with dimensional fields may be deleted:
Runnable finalizer = writer.finish(dataOut, dataOut, dataOut);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
if (writer.getPointCount() > 0) {
indexFPs.put(fieldInfo.name, writer.finish(dataOut));
}
}
}
Expand Down Expand Up @@ -193,26 +184,22 @@ public void merge(MergeState mergeState) throws IOException {
}
}

BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);

// System.out.println("MERGE: field=" + fieldInfo.name);
// Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
// already sorted incoming segments, instead of trying to sort all points again as if
// we were simply reindexing them:
try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
totMaxSize)) {
List<PointValues> pointValues = new ArrayList<>();
List<PointValues> bkdReaders = new ArrayList<>();
List<MergeState.DocMap> docMaps = new ArrayList<>();
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
PointsReader reader = mergeState.pointsReaders[i];
Expand All @@ -231,19 +218,18 @@ public void merge(MergeState mergeState) throws IOException {
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
PointValues aPointValues = reader60.readers.get(readerFieldInfo.number);
if (aPointValues != null) {
pointValues.add(aPointValues);
PointValues bkdReader = reader60.readers.get(readerFieldInfo.number);
if (bkdReader != null) {
bkdReaders.add(bkdReader);
docMaps.add(mergeState.docMaps[i]);
}
}
}
}

Runnable finalizer = writer.merge(dataOut, dataOut, dataOut, docMaps, pointValues);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
long fp = writer.merge(dataOut, docMaps, bkdReaders);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryPoint;
Expand All @@ -35,7 +36,6 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase.Nightly;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.bkd.BKDConfig;

/** Tests Lucene60PointsFormat */
@Nightly // N-2 formats are only tested on nightly runs
Expand All @@ -45,7 +45,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {

public TestLucene60PointsFormat() {
codec = new Lucene84RWCodec();
maxPointsInLeafNode = BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
maxPointsInLeafNode = BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}

@Override
Expand Down Expand Up @@ -280,16 +280,23 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
};

final long pointCount = points.estimatePointCount(onePointMatchVisitor);
final long lastNodePointCount = totalValues % maxPointsInLeafNode;
// With >1 dims, the tree is balanced
long actualMaxPointsInLeafNode = points.size();
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
}
final long countPerFullLeaf = (actualMaxPointsInLeafNode + 1) / 2;
final long countPerNotFullLeaf = (actualMaxPointsInLeafNode) / 2;
assertTrue(
"" + pointCount,
pointCount == (maxPointsInLeafNode + 1) / 2 // common case
|| pointCount == (lastNodePointCount + 1) / 2 // not fully populated leaf
|| pointCount == 2 * ((maxPointsInLeafNode + 1) / 2) // if the point is a split value
|| pointCount == ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2)
// in extreme cases, a point can be shared by 4 leaves
|| pointCount == 4 * ((maxPointsInLeafNode + 1) / 2)
|| pointCount == 3 * ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2));
pointCount + " vs " + actualMaxPointsInLeafNode,
// common case, point in one leaf.
pointCount >= countPerNotFullLeaf && pointCount <= countPerFullLeaf
||
// one dimension is a split value
pointCount >= 2 * countPerNotFullLeaf && pointCount <= 2 * countPerFullLeaf
||
// both dimensions are split values
pointCount >= 4 * countPerNotFullLeaf && pointCount <= 4 * countPerFullLeaf);

final long docCount = points.estimateDocCount(onePointMatchVisitor);
if (multiValues) {
Expand Down

0 comments on commit 800f002

Please sign in to comment.