From c6c741322f9dad14ca502b19c8e63228022b13b2 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sat, 7 Oct 2023 19:59:14 +0800 Subject: [PATCH 01/17] msb vlong --- .../codecs/lucene90/blocktree/FieldReader.java | 2 +- .../blocktree/IntersectTermsEnumFrame.java | 2 +- .../blocktree/Lucene90BlockTreeTermsReader.java | 5 ++++- .../blocktree/Lucene90BlockTreeTermsWriter.java | 15 ++++++++++++++- .../lucene90/blocktree/SegmentTermsEnum.java | 17 +++++++++++++++-- 5 files changed, 35 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 7aba78112e8b..52a6efc3f9ab 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -82,7 +82,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() + SegmentTermsEnum.readMSBVLong(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. 
final IndexInput clone = indexIn.clone(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index eb60d7f35246..fc5e27ee168c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = floorDataReader.readVLong(); + final long code = SegmentTermsEnum.readMSBVLong(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 4054e7a719cd..4925147da60f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,8 +81,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. */ public static final int VERSION_START = 0; + /** Initial terms format. */ + public static final int VERSION_MSB_VLONG_OUTPUT = 1; + /** Current terms format. 
*/ - public static final int VERSION_CURRENT = VERSION_START; + public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT; /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tip"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 1b0bd3568c9f..39ad2f57a27a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -34,6 +34,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; @@ -460,6 +461,18 @@ public String toString() { return "BLOCK: prefix=" + brToString(prefix); } + private final byte[] bytes = new byte[8]; + private final ByteArrayDataOutput scratch = new ByteArrayDataOutput(bytes); + + private void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { + scratch.reset(bytes); + scratch.writeVLong(i); + for (int p = scratch.getPosition() - 1; p > 0; p--) { + scratchBytes.writeByte((byte) ((bytes[p] & 0x7F) | 0x80)); + } + scratchBytes.writeByte((byte) (bytes[0] & 0x7F)); + } + public void compileIndex( List blocks, ByteBuffersDataOutput scratchBytes, @@ -475,7 +488,7 @@ public void compileIndex( // TODO: try writing the leading vLong in MSB order // (opposite of what Lucene does today), for better // outputs sharing in the FST - scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes); if (isFloor) { 
scratchBytes.writeVInt(blocks.size() - 1); for (int i = 1; i < blocks.size(); i++) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 0e865061cf0e..4ad9c02752ec 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermState; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -236,7 +237,7 @@ private FST.Arc getArc(int ord) { SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = scratchReader.readVLong(); + final long code = readMSBVLong(scratchReader); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; @@ -250,6 +251,18 @@ SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int l return f; } + static long readMSBVLong(DataInput input) throws IOException { + long l = 0L; + while (true) { + byte b = input.readByte(); + l = (l << 7) | (b & 0x7FL); + if ((b & 0x80) == 0) { + break; + } + } + return l; + } + // Pushes next'd frame or seek'd frame; we later // lazy-load the frame only when needed SegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length) throws IOException { @@ -980,7 +993,7 @@ private void printSeekState(PrintStream out) throws IOException { } else if (isSeekFrame && !f.isFloor) { final 
ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); - final long codeOrig = reader.readVLong(); + final long codeOrig = readMSBVLong(reader); final long code = (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) From 7b5e44480732103940a55c02416d387d68fbe021 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sat, 7 Oct 2023 20:01:21 +0800 Subject: [PATCH 02/17] tidy --- .../apache/lucene/codecs/lucene90/blocktree/FieldReader.java | 3 ++- .../lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 52a6efc3f9ab..4f7b0aba4f64 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -82,7 +82,8 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - SegmentTermsEnum.readMSBVLong(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) + SegmentTermsEnum.readMSBVLong( + new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. 
final IndexInput clone = indexIn.clone(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 39ad2f57a27a..cc9b72bc861c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -34,7 +34,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; From f09a886ae2723c33eb59f5e41968474b5f2885eb Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sat, 7 Oct 2023 23:56:57 +0800 Subject: [PATCH 03/17] msb vlong --- .../lucene90/blocktree/FieldReader.java | 22 +++++++++++++-- .../blocktree/IntersectTermsEnumFrame.java | 2 +- .../Lucene90BlockTreeTermsWriter.java | 27 +++++++++++-------- .../lucene90/blocktree/SegmentTermsEnum.java | 17 ++---------- 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 4f7b0aba4f64..fa1f817c6c5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -16,12 +16,15 @@ */ package org.apache.lucene.codecs.lucene90.blocktree; +import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT; + import java.io.IOException; import org.apache.lucene.index.FieldInfo; import 
org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; @@ -82,8 +85,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - SegmentTermsEnum.readMSBVLong( - new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) + readMSBVLong(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. final IndexInput clone = indexIn.clone(); @@ -100,6 +102,22 @@ public final class FieldReader extends Terms { */ } + long readMSBVLong(DataInput in) throws IOException { + if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { + long l = 0L; + while (true) { + byte b = in.readByte(); + l = (l << 7) | (b & 0x7FL); + if ((b & 0x80) == 0) { + break; + } + } + return l; + } else { + return in.readVLong(); + } + } + @Override public BytesRef getMin() throws IOException { if (minTerm == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index fc5e27ee168c..80bd8366d199 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = 
SegmentTermsEnum.readMSBVLong(floorDataReader); + final long code = ite.fr.readMSBVLong(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index cc9b72bc861c..a9244314e897 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -208,20 +208,20 @@ order, meaning if you just next() the file pointer will * byte of each sub-block, and its file pointer. * * - * @see Lucene90BlockTreeTermsReader * @lucene.experimental + * @see Lucene90BlockTreeTermsReader */ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { /** * Suggested default value for the {@code minItemsInBlock} parameter to {@link - * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase, int, int)}. */ public static final int DEFAULT_MIN_BLOCK_SIZE = 25; /** * Suggested default value for the {@code maxItemsInBlock} parameter to {@link - * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase, int, int)}. 
*/ public static final int DEFAULT_MAX_BLOCK_SIZE = 48; @@ -438,6 +438,7 @@ private static final class PendingBlock extends PendingEntry { public final boolean hasTerms; public final boolean isFloor; public final int floorLeadByte; + private final byte[] scratch = new byte[10]; public PendingBlock( BytesRef prefix, @@ -460,16 +461,20 @@ public String toString() { return "BLOCK: prefix=" + brToString(prefix); } - private final byte[] bytes = new byte[8]; - private final ByteArrayDataOutput scratch = new ByteArrayDataOutput(bytes); - private void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { - scratch.reset(bytes); - scratch.writeVLong(i); - for (int p = scratch.getPosition() - 1; p > 0; p--) { - scratchBytes.writeByte((byte) ((bytes[p] & 0x7F) | 0x80)); + // Write LSB VLong to scratch + int pos = 0; + while ((i & ~0x7FL) != 0L) { + scratch[pos++] = ((byte) (i & 0x7FL)); + i >>>= 7; + } + scratch[pos] = (byte) i; + // Reverse order + while (pos > 0) { + scratchBytes.writeByte((byte) ((scratch[pos] & 0x7F) | 0x80)); + pos--; } - scratchBytes.writeByte((byte) (bytes[0] & 0x7F)); + scratchBytes.writeByte((byte) (scratch[pos] & 0x7F)); } public void compileIndex( diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 4ad9c02752ec..36df27c841c6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -24,7 +24,6 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermState; import org.apache.lucene.store.ByteArrayDataInput; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -237,7 +236,7 @@ private FST.Arc getArc(int ord) { 
SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = readMSBVLong(scratchReader); + final long code = fr.readMSBVLong(scratchReader); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; @@ -251,18 +250,6 @@ SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int l return f; } - static long readMSBVLong(DataInput input) throws IOException { - long l = 0L; - while (true) { - byte b = input.readByte(); - l = (l << 7) | (b & 0x7FL); - if ((b & 0x80) == 0) { - break; - } - } - return l; - } - // Pushes next'd frame or seek'd frame; we later // lazy-load the frame only when needed SegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length) throws IOException { @@ -993,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException { } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); - final long codeOrig = readMSBVLong(reader); + final long codeOrig = fr.readMSBVLong(reader); final long code = (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? 
Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) From 7b5b9e16b64c89fdd53e82b597aea202733c82c2 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 00:01:32 +0800 Subject: [PATCH 04/17] reduce diff --- .../lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index a9244314e897..d82af68de750 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -208,20 +208,20 @@ order, meaning if you just next() the file pointer will * byte of each sub-block, and its file pointer. * * - * @lucene.experimental * @see Lucene90BlockTreeTermsReader + * @lucene.experimental */ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { /** * Suggested default value for the {@code minItemsInBlock} parameter to {@link - * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase, int, int)}. + * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ public static final int DEFAULT_MIN_BLOCK_SIZE = 25; /** * Suggested default value for the {@code maxItemsInBlock} parameter to {@link - * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase, int, int)}. + * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase,int,int)}. 
*/ public static final int DEFAULT_MAX_BLOCK_SIZE = 48; From 805f918184cb03d492282ece7c77c6571482fa94 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 00:04:12 +0800 Subject: [PATCH 05/17] iter --- .../codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java | 2 +- .../codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 4925147da60f..5936eb877213 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,7 +81,7 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. */ public static final int VERSION_START = 0; - /** Initial terms format. */ + /** Version that uses MSB VLong encoded output */ public static final int VERSION_MSB_VLONG_OUTPUT = 1; /** Current terms format. */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index d82af68de750..777fe5194f2f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -221,7 +221,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { /** * Suggested default value for the {@code maxItemsInBlock} parameter to {@link - * #Lucene90BlockTreeTermsWriter(SegmentWriteState, PostingsWriterBase,int,int)}. 
+ * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */ public static final int DEFAULT_MAX_BLOCK_SIZE = 48; From fd0c3ce301d34c0fd91029cf0e234b14da01abf3 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 00:52:28 +0800 Subject: [PATCH 06/17] fix comment --- .../lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 777fe5194f2f..a656d6ef1406 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -489,9 +489,7 @@ public void compileIndex( assert scratchBytes.size() == 0; - // TODO: try writing the leading vLong in MSB order - // (opposite of what Lucene does today), for better - // outputs sharing in the FST + // write the leading vLong in MSB order for better outputs sharing in the FST writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes); if (isFloor) { scratchBytes.writeVInt(blocks.size() - 1); From 4507c14e17c028aa05604cceac57f40090914111 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 10:09:14 +0800 Subject: [PATCH 07/17] iter --- .../lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index a656d6ef1406..f82a70e1976e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -462,7 +462,7 @@ public String toString() { } private void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { - // Write LSB VLong to scratch + // Write "LSB VLong" to scratch int pos = 0; while ((i & ~0x7FL) != 0L) { scratch[pos++] = ((byte) (i & 0x7FL)); @@ -471,10 +471,10 @@ private void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { scratch[pos] = (byte) i; // Reverse order while (pos > 0) { - scratchBytes.writeByte((byte) ((scratch[pos] & 0x7F) | 0x80)); + scratchBytes.writeByte((byte) (scratch[pos] | 0x80)); pos--; } - scratchBytes.writeByte((byte) (scratch[pos] & 0x7F)); + scratchBytes.writeByte(scratch[pos]); } public void compileIndex( From 0ef627229483b72651732ff796c77e4d9a8ed5b0 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 10:49:51 +0800 Subject: [PATCH 08/17] iter --- .../Lucene90BlockTreeTermsWriter.java | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index f82a70e1976e..604aa9d03e39 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -438,7 +438,6 @@ private static final class PendingBlock extends PendingEntry { public final boolean hasTerms; public final boolean isFloor; public final int floorLeadByte; - private final byte[] scratch = new byte[10]; public PendingBlock( BytesRef prefix, @@ -461,20 +460,17 @@ public String toString() { return "BLOCK: prefix=" + brToString(prefix); } - private void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { - // Write "LSB VLong" to 
scratch - int pos = 0; - while ((i & ~0x7FL) != 0L) { - scratch[pos++] = ((byte) (i & 0x7FL)); - i >>>= 7; + private static void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { + assert i >= 0; + // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. + // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] + int LSBVLongBytes = (Long.SIZE - Long.numberOfLeadingZeros(i) - 1) / 7 + 1; + i <<= Long.SIZE - LSBVLongBytes * 7; + for (int j = 1; j < LSBVLongBytes; j++) { + scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL) | 0x80)); + i = i << 7; } - scratch[pos] = (byte) i; - // Reverse order - while (pos > 0) { - scratchBytes.writeByte((byte) (scratch[pos] | 0x80)); - pos--; - } - scratchBytes.writeByte(scratch[pos]); + scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL))); } public void compileIndex( From 8f86c9542ef7778f33ae9913d9531f21bbe6066e Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 19:56:20 +0800 Subject: [PATCH 09/17] rename --- .../lucene90/blocktree/FieldReader.java | 26 +++++++++++-------- .../blocktree/IntersectTermsEnumFrame.java | 2 +- .../lucene90/blocktree/SegmentTermsEnum.java | 4 +-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index fa1f817c6c5f..5ab481f9201d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -85,7 +85,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - readMSBVLong(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) + readLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> 
Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. final IndexInput clone = indexIn.clone(); @@ -102,22 +102,26 @@ public final class FieldReader extends Terms { */ } - long readMSBVLong(DataInput in) throws IOException { + long readLongOutput(DataInput in) throws IOException { if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { - long l = 0L; - while (true) { - byte b = in.readByte(); - l = (l << 7) | (b & 0x7FL); - if ((b & 0x80) == 0) { - break; - } - } - return l; + return readMSBVLong(in); } else { return in.readVLong(); } } + private static long readMSBVLong(DataInput in) throws IOException { + long l = 0L; + while (true) { + byte b = in.readByte(); + l = (l << 7) | (b & 0x7FL); + if ((b & 0x80) == 0) { + break; + } + } + return l; + } + @Override public BytesRef getMin() throws IOException { if (minTerm == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index 80bd8366d199..5808d88b56fc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = ite.fr.readMSBVLong(floorDataReader); + final long code = ite.fr.readLongOutput(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 
36df27c841c6..f79c4aa69230 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -236,7 +236,7 @@ private FST.Arc getArc(int ord) { SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = fr.readMSBVLong(scratchReader); + final long code = fr.readLongOutput(scratchReader); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; @@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException { } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); - final long codeOrig = fr.readMSBVLong(reader); + final long codeOrig = fr.readLongOutput(reader); final long code = (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? 
Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) From 457df4f2b56ac18bbd1751a40b06b2a990b18cb6 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 22:21:40 +0800 Subject: [PATCH 10/17] fix review comments --- .../codecs/lucene90/blocktree/FieldReader.java | 4 ++-- .../blocktree/IntersectTermsEnumFrame.java | 2 +- .../blocktree/Lucene90BlockTreeTermsReader.java | 2 +- .../blocktree/Lucene90BlockTreeTermsWriter.java | 16 ++++++++-------- .../lucene90/blocktree/SegmentTermsEnum.java | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 5ab481f9201d..7891ce18957e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -85,7 +85,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - readLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) + readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. 
final IndexInput clone = indexIn.clone(); @@ -102,7 +102,7 @@ public final class FieldReader extends Terms { */ } - long readLongOutput(DataInput in) throws IOException { + long readVLongOutput(DataInput in) throws IOException { if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { return readMSBVLong(in); } else { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index 5808d88b56fc..d9ca7a9bbd81 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = ite.fr.readLongOutput(floorDataReader); + final long code = ite.fr.readVLongOutput(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 5936eb877213..05fbf0e5a5af 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,7 +81,7 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. */ public static final int VERSION_START = 0; - /** Version that uses MSB VLong encoded output */ + /** Version that uses MSB VLong encoded output, see GITHUB#12620. 
*/ public static final int VERSION_MSB_VLONG_OUTPUT = 1; /** Current terms format. */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 604aa9d03e39..66081a1494ae 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -460,17 +460,17 @@ public String toString() { return "BLOCK: prefix=" + brToString(prefix); } - private static void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException { - assert i >= 0; + private static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { + assert l >= 0; // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] - int LSBVLongBytes = (Long.SIZE - Long.numberOfLeadingZeros(i) - 1) / 7 + 1; - i <<= Long.SIZE - LSBVLongBytes * 7; - for (int j = 1; j < LSBVLongBytes; j++) { - scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL) | 0x80)); - i = i << 7; + final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; + l <<= Long.SIZE - bytesNeeded * 7; + for (int i = 1; i < bytesNeeded; i++) { + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); + l = l << 7; } - scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL))); + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); } public void compileIndex( diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index f79c4aa69230..cb5577d8d6c9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -236,7 +236,7 @@ private FST.Arc getArc(int ord) { SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = fr.readLongOutput(scratchReader); + final long code = fr.readVLongOutput(scratchReader); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; @@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException { } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); - final long codeOrig = fr.readLongOutput(reader); + final long codeOrig = fr.readVLongOutput(reader); final long code = (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) From 8b538008992268b554dee7be3c23dd7b9f7ed1d0 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 22:27:32 +0800 Subject: [PATCH 11/17] iter --- .../lucene90/blocktree/Lucene90BlockTreeTermsReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 05fbf0e5a5af..f91c7b03cd0b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,7 +81,9 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. 
*/ public static final int VERSION_START = 0; - /** Version that uses MSB VLong encoded output, see GITHUB#12620. */ + /** + * Version that uses MSB VLong encoded output for better outputs sharing in FST, see GITHUB#12620. + */ public static final int VERSION_MSB_VLONG_OUTPUT = 1; /** Current terms format. */ From c491de367d7aa80a7bae65377ea5d26f5c4fe1ff Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Sun, 8 Oct 2023 22:28:13 +0800 Subject: [PATCH 12/17] iter --- .../codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index f91c7b03cd0b..1b385f8fa09f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -82,7 +82,7 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { public static final int VERSION_START = 0; /** - * Version that uses MSB VLong encoded output for better outputs sharing in FST, see GITHUB#12620. + * Version that encodes output as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
*/ public static final int VERSION_MSB_VLONG_OUTPUT = 1; From 8797aba899d7615ff8561741e2a302c74852a851 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Mon, 9 Oct 2023 19:46:32 +0800 Subject: [PATCH 13/17] review fix --- .../lucene90/blocktree/FieldReader.java | 8 ++++- .../Lucene90BlockTreeTermsWriter.java | 32 +++++++++++-------- .../lucene90/blocktree/TestMSBVLong.java | 28 ++++++++++++++++ 3 files changed, 54 insertions(+), 14 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 7891ce18957e..39caaeb622dc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -110,7 +110,13 @@ long readVLongOutput(DataInput in) throws IOException { } } - private static long readMSBVLong(DataInput in) throws IOException { + /** + * Decodes a variable length byte[] in MSB order back to long, as written by {@link + * Lucene90BlockTreeTermsWriter#writeMSBVLong}. + * + *

<p>Package private for testing. + */ + static long readMSBVLong(DataInput in) throws IOException { long l = 0L; while (true) { byte b = in.readByte(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 66081a1494ae..d0114ce0e79d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -430,6 +430,25 @@ static String brToString(byte[] b) { return brToString(new BytesRef(b)); } + /** + * Encodes long value to variable length byte[], in MSB order. Use {@link + * FieldReader#readMSBVLong} to decode. + * + *

<p>Package private for testing. + */ + static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { + assert l >= 0; + // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. + // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] + final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; + l <<= Long.SIZE - bytesNeeded * 7; + for (int i = 1; i < bytesNeeded; i++) { + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); + l = l << 7; + } + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); + } + private static final class PendingBlock extends PendingEntry { public final BytesRef prefix; public final long fp; @@ -460,19 +479,6 @@ public String toString() { return "BLOCK: prefix=" + brToString(prefix); } - private static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { - assert l >= 0; - // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. - // e.g.
we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] - final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; - l <<= Long.SIZE - bytesNeeded * 7; - for (int i = 1; i < bytesNeeded; i++) { - scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); - l = l << 7; - } - scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); - } - public void compileIndex( List blocks, ByteBuffersDataOutput scratchBytes, diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java new file mode 100644 index 000000000000..4671d7255959 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java @@ -0,0 +1,28 @@ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; + +public class TestMSBVLong extends LuceneTestCase { + public void testMSBVLong() throws IOException { + assertMSBVLong(0); + assertMSBVLong(Long.MAX_VALUE); + int iter = atLeast(10000); + for (int i = 0; i < iter; i++) { + assertMSBVLong(random().nextLong(Long.MAX_VALUE)); + } + } + + private static void assertMSBVLong(long l) throws IOException { + byte[] bytes = new byte[10]; + ByteArrayDataOutput output = new ByteArrayDataOutput(bytes); + Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output); + ByteArrayDataInput in = + new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition())); + long recovered = FieldReader.readMSBVLong(in); + assertEquals(l + " != " + recovered, l, recovered); + } +} From ea4560f0ee17eeef85b5912db580bbb0f02c7df5 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Mon, 9 Oct 2023 19:52:06 +0800 Subject: [PATCH 14/17] iter --- 
.../codecs/lucene90/blocktree/TestMSBVLong.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java index 4671d7255959..121ffa40addf 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.codecs.lucene90.blocktree; import java.io.IOException; @@ -7,6 +23,7 @@ import org.apache.lucene.util.ArrayUtil; public class TestMSBVLong extends LuceneTestCase { + public void testMSBVLong() throws IOException { assertMSBVLong(0); assertMSBVLong(Long.MAX_VALUE); From 039cec49a123c332bd2c3b71c81ba2fd394f8d1a Mon Sep 17 00:00:00 2001 From: gf2121 <52390227+gf2121@users.noreply.github.com> Date: Mon, 9 Oct 2023 20:15:26 +0800 Subject: [PATCH 15/17] only test first 10k values Co-authored-by: Adrien Grand --- .../apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java index 121ffa40addf..5ba14d40d28b 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java @@ -29,7 +29,7 @@ public void testMSBVLong() throws IOException { assertMSBVLong(Long.MAX_VALUE); int iter = atLeast(10000); for (int i = 0; i < iter; i++) { - assertMSBVLong(random().nextLong(Long.MAX_VALUE)); + assertMSBVLong(i); } } From 9c807094afb39805900e4778b1ecf008df513f8c Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Mon, 9 Oct 2023 21:34:47 +0800 Subject: [PATCH 16/17] add CHANGES --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 736b7f2f4ef1..0552300acaed 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -178,6 +178,9 @@ Optimizations * GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng) +* GITHUB#12623: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size + of .tip file. 
(Guo Feng) + Changes in runtime behavior --------------------- From 4282b87b28cc0d335d80bf5603f5b6facd06a632 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Tue, 10 Oct 2023 11:47:38 +0800 Subject: [PATCH 17/17] no need to specialize assertMSBVLong(0) as already done in loop --- .../apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java index 5ba14d40d28b..1ebab9262099 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java @@ -25,10 +25,9 @@ public class TestMSBVLong extends LuceneTestCase { public void testMSBVLong() throws IOException { - assertMSBVLong(0); assertMSBVLong(Long.MAX_VALUE); int iter = atLeast(10000); - for (int i = 0; i < iter; i++) { + for (long i = 0; i < iter; i++) { assertMSBVLong(i); } }