LUCENE-9378: Make it possible to configure how to trade speed for compression on doc values. #2069

Merged: 7 commits, Nov 12, 2020
Changes from 3 commits
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -216,6 +216,9 @@ New Features

* LUCENE-9553: New XYPoint query that accepts an array of XYGeometries. (Ignacio Vera)

* LUCENE-9378: Doc values now allow configuring how to trade compression for
retrieval speed. (Adrien Grand)

Improvements
---------------------

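As a quick illustration of the new CHANGES entry above (a minimal sketch, not part of this diff; the analyzer choice and the in-memory directory are placeholders), an application opts into the new behavior through the codec-level Mode introduced below, which configures stored fields and doc values with one switch:

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class BestCompressionSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new ByteBuffersDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    // One switch trades retrieval speed for compression on both stored fields and doc values.
    config.setCodec(new Lucene87Codec(Lucene87Codec.Mode.BEST_COMPRESSION));
    try (IndexWriter writer = new IndexWriter(dir, config)) {
      // Index documents as usual; newly flushed/merged segments use the compressed layouts.
    }
  }
}
```

Segments written before the switch keep their existing layout; only newly flushed or merged segments pick up the configured mode.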
Lucene87Codec.java
@@ -36,6 +36,7 @@
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
@@ -56,6 +57,23 @@
* @lucene.experimental
*/
public class Lucene87Codec extends Codec {

/** Configuration option for the codec. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene80DocValuesFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);

private final Lucene87StoredFieldsFormat.Mode storedMode;
private final Lucene80DocValuesFormat.Mode dvMode;

private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) {
Member: Nice! So we roll up the tradeoffs to the Codec level, which will then tell each format how to trade off.

Contributor Author: Right. It's still possible to make different choices for stored fields and doc values, given that we allow configuring doc values on a per-field basis, but this should at least keep the simple use case simple, with one switch that configures stored fields and doc values at the same time.

Member: Great! Simple for common use cases ("I want best compression" or "I want fastest search"), and complex for complex use cases ("I want separate control for each part of the index").

this.storedMode = Objects.requireNonNull(storedMode);
this.dvMode = Objects.requireNonNull(dvMode);
}
}

private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
@@ -84,7 +102,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
* Instantiates a new codec.
*/
public Lucene87Codec() {
this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
this(Mode.BEST_SPEED);
}

/**
@@ -93,10 +111,11 @@ public Lucene87Codec() {
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene87Codec(Lucene87StoredFieldsFormat.Mode mode) {
public Lucene87Codec(Mode mode) {
super("Lucene87");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultFormat = new Lucene84PostingsFormat();
this.defaultDVFormat = new Lucene80DocValuesFormat(mode.dvMode);
}

@Override
@@ -173,7 +192,7 @@ public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final DocValuesFormat defaultDVFormat;

private final NormsFormat normsFormat = new Lucene80NormsFormat();

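For the per-field escape hatch mentioned in the review thread above, a rough sketch of what it could look like (this assumes getDocValuesFormatForField remains overridable, as the hunk above suggests; the class name and the field name "large_blob" are invented for illustration):

```java
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;

public class PerFieldCompressionSketch {

  /** Fast doc-values layout by default, compressed layout for one known-large binary field. */
  static Codec perFieldCodec() {
    DocValuesFormat compressed =
        new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);
    return new Lucene87Codec(Lucene87Codec.Mode.BEST_SPEED) {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // "large_blob" is a hypothetical field name used only for illustration.
        return "large_blob".equals(field) ? compressed : super.getDocValuesFormatForField(field);
      }
    };
  }
}
```

The codec-level Mode still decides the stored fields compression; the override only swaps the doc values format for the selected field.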
Lucene80DocValuesConsumer.java
@@ -64,12 +64,14 @@
/** writer for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {

final Lucene80DocValuesFormat.Mode mode;
IndexOutput data, meta;
final int maxDoc;
private final SegmentWriteState state;

/** expert: Creates a new writer */
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, Lucene80DocValuesFormat.Mode mode) throws IOException {
this.mode = mode;
boolean success = false;
try {
this.state = state;
@@ -490,13 +492,86 @@ public void close() throws IOException {
}

}


@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene80DocValuesFormat.BINARY);

switch (mode) {
case BEST_SPEED:
meta.writeByte((byte) 0);
doAddUncompressedBinaryField(field, valuesProducer);
break;
case BEST_COMPRESSION:
meta.writeByte((byte) 1);
doAddCompressedBinaryField(field, valuesProducer);
break;
default:
throw new AssertionError();
}
}

private void doAddUncompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
meta.writeLong(start); // dataOffset
int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
numDocsWithField++;
BytesRef v = values.binaryValue();
int length = v.length;
data.writeBytes(v.bytes, v.offset, v.length);
minLength = Math.min(length, minLength);
maxLength = Math.max(length, maxLength);
}
assert numDocsWithField <= maxDoc;
meta.writeLong(data.getFilePointer() - start); // dataLength

if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}

meta.writeInt(numDocsWithField);
meta.writeInt(minLength);
meta.writeInt(maxLength);
if (maxLength > minLength) {
start = data.getFilePointer();
meta.writeLong(start);
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);

final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
long addr = 0;
writer.add(addr);
values = valuesProducer.getBinary(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
addr += values.binaryValue().length;
writer.add(addr);
}
writer.finish();
meta.writeLong(data.getFilePointer() - start);
}
}

private void doAddCompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
try (CompressedBinaryBlockWriter blockWriter = new CompressedBinaryBlockWriter()){
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
@@ -542,7 +617,6 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th
meta.writeInt(maxLength);

blockWriter.writeMetaData();

}

}
Lucene80DocValuesFormat.java
@@ -18,6 +18,7 @@


import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -131,14 +132,30 @@
*/
public final class Lucene80DocValuesFormat extends DocValuesFormat {

/** Sole Constructor */
/** Configuration option for doc values. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED,
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION
}

private final Mode mode;

/** Default constructor. */
public Lucene80DocValuesFormat() {
this(Mode.BEST_SPEED);
}

/** Constructor */
public Lucene80DocValuesFormat(Mode mode) {
super("Lucene80");
this.mode = Objects.requireNonNull(mode);
}

@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION, mode);
}

@Override
@@ -152,7 +169,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_BIN_COMPRESSED = 1;
static final int VERSION_CURRENT = VERSION_BIN_COMPRESSED;
static final int VERSION_CONFIGURABLE_COMPRESSION = 2;
static final int VERSION_CURRENT = VERSION_CONFIGURABLE_COMPRESSION;

// indicates docvalues type
static final byte NUMERIC = 0;
Lucene80DocValuesProducer.java
Expand Up @@ -174,6 +174,20 @@ private void readNumeric(ChecksumIndexInput meta, NumericEntry entry) throws IOE

private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
int b = meta.readByte();
switch (b) {
case 0:
case 1:
// valid
break;
default:
throw new CorruptIndexException("Unexpected byte: " + b + ", expected 0 or 1", meta);
}
entry.compressed = b != 0;
} else {
entry.compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
entry.dataOffset = meta.readLong();
entry.dataLength = meta.readLong();
entry.docsWithFieldOffset = meta.readLong();
@@ -183,19 +197,19 @@ private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
entry.numDocsWithField = meta.readInt();
entry.minLength = meta.readInt();
entry.maxLength = meta.readInt();
if ((version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
if ((entry.compressed && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
entry.addressesOffset = meta.readLong();

// Old count of uncompressed addresses
long numAddresses = entry.numDocsWithField + 1L;
// New count of compressed addresses - the number of compressed blocks
if (version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
if (entry.compressed) {
entry.numCompressedChunks = meta.readVInt();
entry.docsPerChunkShift = meta.readVInt();
entry.maxUncompressedChunkSize = meta.readVInt();
numAddresses = entry.numCompressedChunks;
}

final int blockShift = meta.readVInt();
entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, numAddresses, blockShift);
ramBytesUsed += entry.addressesMeta.ramBytesUsed();
@@ -303,6 +317,7 @@ private static class NumericEntry {
}

private static class BinaryEntry {
boolean compressed;
long dataOffset;
long dataLength;
long docsWithFieldOffset;
@@ -680,9 +695,7 @@ public boolean advanceExact(int target) throws IOException {
}
}

// BWC - old binary format
private BinaryDocValues getUncompressedBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
private BinaryDocValues getUncompressedBinary(BinaryEntry entry) throws IOException {
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
@@ -844,11 +857,16 @@ BytesRef decode(int docNumber) throws IOException {

@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
if (version < Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
return getUncompressedBinary(field);
BinaryEntry entry = binaries.get(field.name);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
return getUncompressedBinary(entry);
}
}

private BinaryDocValues getCompressedBinary(BinaryEntry entry) throws IOException {

BinaryEntry entry = binaries.get(field.name);
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
Lucene90Codec.java
@@ -34,6 +34,7 @@
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
@@ -53,6 +54,23 @@
* @lucene.experimental
*/
public class Lucene90Codec extends Codec {

/** Configuration option for the codec. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene80DocValuesFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);

private final Lucene87StoredFieldsFormat.Mode storedMode;
private final Lucene80DocValuesFormat.Mode dvMode;

private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) {
this.storedMode = Objects.requireNonNull(storedMode);
this.dvMode = Objects.requireNonNull(dvMode);
}
}

private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
@@ -82,7 +100,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
* Instantiates a new codec.
*/
public Lucene90Codec() {
this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
this(Mode.BEST_SPEED);
}

/**
@@ -91,10 +109,11 @@ public Lucene90Codec() {
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene90Codec(Lucene87StoredFieldsFormat.Mode mode) {
public Lucene90Codec(Mode mode) {
super("Lucene90");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultFormat = new Lucene84PostingsFormat();
this.defaultDVFormat = new Lucene80DocValuesFormat(mode.dvMode);
}

@Override
@@ -172,7 +191,7 @@ public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final DocValuesFormat defaultDVFormat;

private final NormsFormat normsFormat = new Lucene80NormsFormat();
