Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write MSB VLong for better outputs sharing in block tree index #12631

Merged
merged 18 commits into from
Oct 10, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
*/
package org.apache.lucene.codecs.lucene90.blocktree;

import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;

import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Expand Down Expand Up @@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
// + rootCode + " divisor=" + indexDivisor);
// }
rootBlockFP =
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
Expand All @@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
*/
}

long readVLongOutput(DataInput in) throws IOException {
if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
return readMSBVLong(in);
} else {
return in.readVLong();
}
}

/**
* Decodes a variable length byte[] in MSB order back to long, as written by {@link
* Lucene90BlockTreeTermsWriter#writeMSBVLong}.
*
* <p>Package private for testing.
*/
static long readMSBVLong(DataInput in) throws IOException {
long l = 0L;
while (true) {
byte b = in.readByte();
l = (l << 7) | (b & 0x7FL);
if ((b & 0x80) == 0) {
break;
}
}
return l;
}

@Override
public BytesRef getMin() throws IOException {
if (minTerm == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
final long code = ite.fr.readVLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
/** Initial terms format. */
public static final int VERSION_START = 0;

/**
* Version that encode output as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
*/
public static final int VERSION_MSB_VLONG_OUTPUT = 1;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment explaining what changed, and link to the GitHub issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote some comments on VERSION_MSB_VLONG_OUTPUT. I prefer to leave comments there, as VERSION_CURRENT could probably link to other version in feature. WDYT?

Issue added :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry, yes - VERSION_MSB_VLONG_OUTPUT is the right place for the comment! Thanks.


/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,25 @@ static String brToString(byte[] b) {
return brToString(new BytesRef(b));
}

/**
* Encodes long value to variable length byte[], in MSB order. Use {@link
* FieldReader#readMSBVLong} to decode.
*
* <p>Package private for testing
*/
static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
assert l >= 0;
// Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
// e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1;
l <<= Long.SIZE - bytesNeeded * 7;
for (int i = 1; i < bytesNeeded; i++) {
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80));
l = l << 7;
}
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL)));
}

private static final class PendingBlock extends PendingEntry {
public final BytesRef prefix;
public final long fp;
Expand Down Expand Up @@ -472,10 +491,8 @@ public void compileIndex(

assert scratchBytes.size() == 0;

// TODO: try writing the leading vLong in MSB order
// (opposite of what Lucene does today), for better
// outputs sharing in the FST
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
// write the leading vLong in MSB order for better outputs sharing in the FST
writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
if (isFloor) {
scratchBytes.writeVInt(blocks.size() - 1);
for (int i = 1; i < blocks.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ private FST.Arc<BytesRef> getArc(int ord) {
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = scratchReader.readVLong();
final long code = fr.readVLongOutput(scratchReader);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
Expand Down Expand Up @@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException {
} else if (isSeekFrame && !f.isFloor) {
final ByteArrayDataInput reader =
new ByteArrayDataInput(output.bytes, output.offset, output.length);
final long codeOrig = reader.readVLong();
final long codeOrig = fr.readVLongOutput(reader);
final long code =
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90.blocktree;

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.ArrayUtil;

public class TestMSBVLong extends LuceneTestCase {

public void testMSBVLong() throws IOException {
assertMSBVLong(0);
assertMSBVLong(Long.MAX_VALUE);
int iter = atLeast(10000);
for (int i = 0; i < iter; i++) {
assertMSBVLong(random().nextLong(Long.MAX_VALUE));
gf2121 marked this conversation as resolved.
Show resolved Hide resolved
}
}

private static void assertMSBVLong(long l) throws IOException {
byte[] bytes = new byte[10];
ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
ByteArrayDataInput in =
new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
long recovered = FieldReader.readMSBVLong(in);
assertEquals(l + " != " + recovered, l, recovered);
}
}
Loading