Skip to content

Commit

Permalink
LUCENE-9820: PointTree#size() should handle the case of balanced tree…
Browse files Browse the repository at this point in the history
… in pre-8.6 indexes (#462)

Properly handle the case where trees are fully balanced when the number of dimensions is greater than 1
  • Loading branch information
iverase committed Nov 25, 2021
1 parent 8710252 commit 800f002
Show file tree
Hide file tree
Showing 8 changed files with 2,546 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MutablePointTree;
Expand All @@ -36,8 +37,6 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.bkd.BKDConfig;
import org.apache.lucene.util.bkd.BKDWriter;

/** Writes dimensional values */
public class Lucene60PointsWriter extends PointsWriter {
Expand Down Expand Up @@ -91,37 +90,31 @@ public Lucene60PointsWriter(
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
this(
writeState,
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter60.DEFAULT_MAX_MB_SORT_IN_HEAP);
}

@Override
public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {

PointValues.PointTree values = reader.getValues(fieldInfo.name).getPointTree();

BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);

try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
values.size())) {

if (values instanceof MutablePointTree) {
Runnable finalizer =
writer.writeField(dataOut, dataOut, dataOut, fieldInfo.name, (MutablePointTree) values);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointTree) values);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
return;
}
Expand All @@ -145,10 +138,8 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
});

// We could have 0 points on merge since all docs with dimensional fields may be deleted:
Runnable finalizer = writer.finish(dataOut, dataOut, dataOut);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
if (writer.getPointCount() > 0) {
indexFPs.put(fieldInfo.name, writer.finish(dataOut));
}
}
}
Expand Down Expand Up @@ -193,26 +184,22 @@ public void merge(MergeState mergeState) throws IOException {
}
}

BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);

// System.out.println("MERGE: field=" + fieldInfo.name);
// Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
// already sorted incoming segments, instead of trying to sort all points again as if
// we were simply reindexing them:
try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
totMaxSize)) {
List<PointValues> pointValues = new ArrayList<>();
List<PointValues> bkdReaders = new ArrayList<>();
List<MergeState.DocMap> docMaps = new ArrayList<>();
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
PointsReader reader = mergeState.pointsReaders[i];
Expand All @@ -231,19 +218,18 @@ public void merge(MergeState mergeState) throws IOException {
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
PointValues aPointValues = reader60.readers.get(readerFieldInfo.number);
if (aPointValues != null) {
pointValues.add(aPointValues);
PointValues bkdReader = reader60.readers.get(readerFieldInfo.number);
if (bkdReader != null) {
bkdReaders.add(bkdReader);
docMaps.add(mergeState.docMaps[i]);
}
}
}
}

Runnable finalizer = writer.merge(dataOut, dataOut, dataOut, docMaps, pointValues);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
long fp = writer.merge(dataOut, docMaps, bkdReaders);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryPoint;
Expand All @@ -35,7 +36,6 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase.Nightly;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.bkd.BKDConfig;

/** Tests Lucene60PointsFormat */
@Nightly // N-2 formats are only tested on nightly runs
Expand All @@ -45,7 +45,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {

public TestLucene60PointsFormat() {
codec = new Lucene84RWCodec();
maxPointsInLeafNode = BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
maxPointsInLeafNode = BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}

@Override
Expand Down Expand Up @@ -280,16 +280,23 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
};

final long pointCount = points.estimatePointCount(onePointMatchVisitor);
final long lastNodePointCount = totalValues % maxPointsInLeafNode;
// With >1 dims, the tree is balanced
long actualMaxPointsInLeafNode = points.size();
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
}
final long countPerFullLeaf = (actualMaxPointsInLeafNode + 1) / 2;
final long countPerNotFullLeaf = (actualMaxPointsInLeafNode) / 2;
assertTrue(
"" + pointCount,
pointCount == (maxPointsInLeafNode + 1) / 2 // common case
|| pointCount == (lastNodePointCount + 1) / 2 // not fully populated leaf
|| pointCount == 2 * ((maxPointsInLeafNode + 1) / 2) // if the point is a split value
|| pointCount == ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2)
// in extreme cases, a point can be shared by 4 leaves
|| pointCount == 4 * ((maxPointsInLeafNode + 1) / 2)
|| pointCount == 3 * ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2));
pointCount + " vs " + actualMaxPointsInLeafNode,
// common case, point in one leaf.
pointCount >= countPerNotFullLeaf && pointCount <= countPerFullLeaf
||
// one dimension is a split value
pointCount >= 2 * countPerNotFullLeaf && pointCount <= 2 * countPerFullLeaf
||
// both dimensions are split values
pointCount >= 4 * countPerNotFullLeaf && pointCount <= 4 * countPerFullLeaf);

final long docCount = points.estimateDocCount(onePointMatchVisitor);
if (multiValues) {
Expand Down

0 comments on commit 800f002

Please sign in to comment.