Skip to content

Commit

Permalink
Use group-varint encoding for the tail of postings (#12782)
Browse files Browse the repository at this point in the history
Co-authored-by: Adrien Grand <jpountz@gmail.com>
  • Loading branch information
easyice and jpountz committed Nov 20, 2023
1 parent 43e4bb9 commit 36c727c
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 24 deletions.
10 changes: 7 additions & 3 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,14 @@ Optimizations

* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)

* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)

* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)

* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)

* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)

Changes in runtime behavior
---------------------

Expand Down Expand Up @@ -258,7 +262,7 @@ GITHUB#12491: Hunspell: speed up the dictionary enumeration on suggestion (Peter
Optimizations
---------------------

* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)

* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
(Adrien Grand)
Expand All @@ -273,7 +277,7 @@ Optimizations

* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)

* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)

* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)

Expand Down Expand Up @@ -340,7 +344,7 @@ Other
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
(Shubham Chaudhary)

* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)

======================== Lucene 9.7.0 =======================

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;

/**
 * Decode integers using group-varint.
 *
 * <p>Values are stored in groups of four. Each group starts with one flag byte holding four 2-bit
 * fields (first value in the two highest bits, fourth value in the two lowest bits); each field is
 * the number of bytes used by the corresponding value, minus one. Values that do not fill a
 * complete group are stored as VInts after the last group.
 */
public class GroupVIntReader {

  /**
   * Reads {@code limit} values from {@code in} into {@code docs[0..limit)}.
   *
   * <p>Every decoded value is treated as an unsigned 32-bit integer, so the entries written to
   * {@code docs} are always in the range {@code [0, 2^32)}.
   *
   * @param in input positioned at the start of the encoded values
   * @param docs destination array, must have room for at least {@code limit} entries
   * @param limit number of values to decode
   * @throws IOException if reading from {@code in} fails
   */
  public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
    int i;
    // Complete groups: one flag byte followed by four values of 1-4 bytes each.
    for (i = 0; i <= limit - 4; i += 4) {
      final int flag = in.readByte() & 0xFF;

      final int n1Minus1 = flag >> 6;
      final int n2Minus1 = (flag >> 4) & 0x03;
      final int n3Minus1 = (flag >> 2) & 0x03;
      final int n4Minus1 = flag & 0x03;

      docs[i] = readLong(in, n1Minus1);
      docs[i + 1] = readLong(in, n2Minus1);
      docs[i + 2] = readLong(in, n3Minus1);
      docs[i + 3] = readLong(in, n4Minus1);
    }
    // Tail: fewer than four values remain, each stored as a VInt.
    for (; i < limit; ++i) {
      // readVInt() returns an int; mask it so that a value with bit 31 set is not sign-extended
      // into a negative long. This keeps the tail consistent with the grouped path, which reads
      // 4-byte values as unsigned.
      docs[i] = in.readVInt() & 0xFFFFFFFFL;
    }
  }

  /**
   * Reads one value of {@code numBytesMinus1 + 1} bytes and returns it zero-extended to a long.
   * For the 3-byte case the short supplies the low 16 bits and the following byte bits 16-23.
   */
  private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
    switch (numBytesMinus1) {
      case 0:
        return in.readByte() & 0xFFL;
      case 1:
        return in.readShort() & 0xFFFFL;
      case 2:
        return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
      default:
        return in.readInt() & 0xFFFFFFFFL;
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataOutput;

/**
 * Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a
 * group.
 *
 * <p>Each group of four values is written as one flag byte followed by the values themselves,
 * lowest byte first, using 1 to 4 bytes per value. The flag byte holds four 2-bit fields (first
 * value in the two highest bits) giving the byte length of each value minus one.
 *
 * <p>An instance keeps a scratch buffer and a write position as fields, so a single instance must
 * not be used by several threads at the same time.
 */
public class GroupVIntWriter {

  // the maximum size of one group is 4 integers + 1 byte flag.
  private final byte[] bytes = new byte[17];
  private int byteOffset = 0;

  public GroupVIntWriter() {}

  /**
   * Narrows {@code value} to an int, rejecting anything that does not fit in 32 unsigned bits
   * instead of silently dropping the upper bits.
   */
  private static int toInt(long value) {
    if ((value >>> 32) != 0) {
      throw new ArithmeticException("value out of unsigned 32-bit range: " + value);
    }
    return (int) value;
  }

  /**
   * Appends {@code v} to the scratch buffer, lowest byte first, using as few bytes as possible
   * (at least one), and returns the number of bytes written.
   */
  private int encodeValue(int v) {
    int lastOff = byteOffset;
    do {
      bytes[byteOffset++] = (byte) (v & 0xFF);
      v >>>= 8;
    } while (v != 0);
    return byteOffset - lastOff;
  }

  /**
   * Writes {@code values[0..limit)} to {@code out}.
   *
   * @param out destination of the encoded bytes
   * @param values values to encode; each must be in the range {@code [0, 2^32)}
   * @param limit number of leading entries of {@code values} to encode
   * @throws IOException if writing to {@code out} fails
   * @throws ArithmeticException if a value does not fit in 32 unsigned bits; groups preceding the
   *     offending value may already have been written
   */
  public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
    int off = 0;

    // encode each group
    while ((limit - off) >= 4) {
      byte flag = 0;
      // slot 0 is reserved for the flag byte, which is only known once all four values are encoded
      byteOffset = 1;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 6;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 4;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 2;
      flag |= (encodeValue(toInt(values[off++])) - 1);
      bytes[0] = flag;
      out.writeBytes(bytes, byteOffset);
    }

    // tail vints
    for (; off < limit; off++) {
      out.writeVInt(toInt(values[off]));
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -144,20 +144,15 @@ public void init(IndexInput termsIn, SegmentReadState state) throws IOException
static void readVIntBlock(
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
throws IOException {
GroupVIntReader.readValues(docIn, docBuffer, num);
if (indexHasFreq) {
for (int i = 0; i < num; i++) {
final int code = docIn.readVInt();
docBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else {
for (int i = 0; i < num; i++) {
docBuffer[i] = docIn.readVInt();
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private final Lucene99SkipWriter skipWriter;
private final GroupVIntWriter docGroupVIntWriter;

private boolean fieldHasNorms;
private NumericDocValues norms;
Expand Down Expand Up @@ -172,6 +173,7 @@ public Lucene99PostingsWriter(SegmentWriteState state) throws IOException {
skipWriter =
new Lucene99SkipWriter(
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
docGroupVIntWriter = new GroupVIntWriter();
}

@Override
Expand Down Expand Up @@ -370,17 +372,19 @@ public void finishTerm(BlockTermState _state) throws IOException {
singletonDocID = (int) docDeltaBuffer[0];
} else {
singletonDocID = -1;
// vInt encode the remaining doc deltas and freqs:
for (int i = 0; i < docBufferUpto; i++) {
final int docDelta = (int) docDeltaBuffer[i];
final int freq = (int) freqBuffer[i];
if (!writeFreqs) {
docOut.writeVInt(docDelta);
} else if (freq == 1) {
docOut.writeVInt((docDelta << 1) | 1);
} else {
docOut.writeVInt(docDelta << 1);
docOut.writeVInt(freq);
// Group vInt encode the remaining doc deltas and freqs:
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
final int freq = (int) freqBuffer[i];
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

/** Round-trip test for {@link GroupVIntWriter} and {@link GroupVIntReader}. */
public class TestGroupVInt extends LuceneTestCase {

  /**
   * Encodes blocks of random non-negative ints with a random bit width (1 to 31) and a random
   * length (1 to {@code ForUtil.BLOCK_SIZE}), decodes them again and checks that the decoded
   * prefix equals the input.
   */
  public void testEncodeDecode() throws IOException {
    final int blockSize = ForUtil.BLOCK_SIZE;
    final long[] input = new long[blockSize];
    final long[] output = new long[blockSize];
    final int rounds = atLeast(100);

    final GroupVIntWriter writer = new GroupVIntWriter();
    // Scratch space shared by all rounds; every round writes from the start of the array.
    final byte[] buffer = new byte[(int) (Integer.BYTES * blockSize * 1.25)];

    for (int round = 0; round < rounds; round++) {
      final int bitsPerValue = TestUtil.nextInt(random(), 1, 31);
      final int count = TestUtil.nextInt(random(), 1, blockSize);
      final int maxValue = (int) PackedInts.maxValue(bitsPerValue);

      // fill the first `count` slots and encode them
      for (int k = 0; k < count; k++) {
        input[k] = RandomNumbers.randomIntBetween(random(), 0, maxValue);
      }
      final ByteArrayDataOutput sink = new ByteArrayDataOutput(buffer);
      writer.writeValues(sink, input, count);

      // decode from the same bytes and compare only the slots used in this round
      final ByteArrayDataInput source = new ByteArrayDataInput(buffer);
      GroupVIntReader.readValues(source, output, count);
      final long[] expected = ArrayUtil.copyOfSubArray(input, 0, count);
      final long[] actual = ArrayUtil.copyOfSubArray(output, 0, count);
      assertArrayEquals(expected, actual);
    }
  }
}

0 comments on commit 36c727c

Please sign in to comment.