LUCENE-9539: Use more compact datastructures for sorting doc-values (#1908)

This change cuts over from object-based data structures to primitive / compressed data structures.
s1monw committed Sep 22, 2020
1 parent 1611586 commit c82b994
Showing 5 changed files with 135 additions and 128 deletions.
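
To make the diffs below easier to follow, here is a minimal sketch of the cutover described above, using only Lucene utility classes that the touched files already import. The demo class, its sample value, and the document numbering are hypothetical and not part of the commit: the old representation keeps one BytesRef object per document plus a FixedBitSet of documents that have a value, while the new one packs all bytes into a single BytesRefArray and addresses them through a primitive int per document, with 0 meaning "no value".

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;

// Hypothetical side-by-side of the old and new in-memory layouts; not the committed code.
public class LayoutComparisonDemo {
  public static void main(String[] args) {
    int maxDoc = 3;

    // Old layout: one BytesRef object per document that has a value,
    // plus a FixedBitSet recording which documents have one.
    BytesRef[] perDocObjects = new BytesRef[maxDoc];
    FixedBitSet docsWithField = new FixedBitSet(maxDoc);
    perDocObjects[1] = BytesRef.deepCopyOf(new BytesRef("value-for-doc-1"));
    docsWithField.set(1);

    // New layout: all bytes packed into one BytesRefArray, addressed by a
    // primitive int per document (1-based; 0 means the document has no value).
    int[] offsets = new int[maxDoc];
    BytesRefArray packed = new BytesRefArray(Counter.newCounter());
    int nextOffset = 1;
    packed.append(new BytesRef("value-for-doc-1"));
    offsets[1] = nextOffset++;

    System.out.println("old layout, doc 1 has a value: " + docsWithField.get(1));
    System.out.println("new layout, doc 1 has a value: " + (offsets[1] != 0));
  }
}
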
lucene/CHANGES.txt (3 additions, 0 deletions)
@@ -238,6 +238,9 @@ Improvements
* LUCENE-9523: In query shapes over shape fields, skip points while traversing the
BKD tree when the relationship with the document is already known. (Ignacio Vera)

* LUCENE-9539: Use more compact datastructures to represent sorted doc-values in memory when
sorting a segment before flush and in SortingCodecReader. (Simon Willnauer)

Optimizations
---------------------

BinaryDocValuesWriter.java

@@ -24,11 +24,10 @@
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@@ -58,7 +57,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {

private PackedLongValues finalLengths;

public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.bytes = new PagedBytes(BLOCK_BITS);
this.bytesOut = bytes.getDataOutput();
@@ -100,21 +99,6 @@ private void updateBytesUsed() {
bytesUsed = newBytesUsed;
}

static CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
BytesRef[] values = new BytesRef[maxDoc];
while (true) {
int docID = oldValues.nextDoc();
if (docID == NO_MORE_DOCS) {
break;
}
int newDocID = sortMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = BytesRef.deepCopyOf(oldValues.binaryValue());
}
return new CachedBinaryDVs(values, docsWithField);
}

@Override
BinaryDocValues getDocValues() {
if (finalLengths == null) {
@@ -131,7 +115,7 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
}
final CachedBinaryDVs sorted;
if (sortMap != null) {
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
@@ -206,20 +190,21 @@ public BytesRef binaryValue() {

static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;
private long cost = -1;

SortingBinaryDocValues(CachedBinaryDVs dvs) {
this.dvs = dvs;
}

@Override
public int nextDoc() {
if (docID+1 == dvs.docsWithField.length()) {
docID = NO_MORE_DOCS;
} else {
docID = dvs.docsWithField.nextSetBit(docID+1);
}
do {
docID++;
if (docID == dvs.offsets.length) {
return docID = NO_MORE_DOCS;
}
} while (dvs.offsets[docID] <= 0);
return docID;
}

@@ -240,26 +225,29 @@ public boolean advanceExact(int target) throws IOException {

@Override
public BytesRef binaryValue() {
return dvs.values[docID];
dvs.values.get(spare, dvs.offsets[docID]-1);
return spare.get();
}

@Override
public long cost() {
if (cost == -1) {
cost = dvs.docsWithField.cardinality();
}
return cost;
return dvs.values.size();
}
}

static class CachedBinaryDVs {
// TODO: at least cutover to BytesRefArray here:
private final BytesRef[] values;
private final BitSet docsWithField;

CachedBinaryDVs(BytesRef[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
static final class CachedBinaryDVs {
final int[] offsets;
final BytesRefArray values;
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document
int docID;
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = sortMap.oldToNew(docID);
values.append(oldValues.binaryValue());
offsets[newDocID] = offset++;
}
}
}
}
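
As an illustration of how the new CachedBinaryDVs layout above is read back, here is a hedged sketch of the iteration pattern SortingBinaryDocValues follows: walk the offsets array, skip documents whose offset is 0, and resolve non-zero offsets (shifted by one) against the BytesRefArray. The CompactBinaryReader class and its NO_MORE_DOCS constant are hypothetical stand-ins, not the committed code.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;

// Hypothetical reader over an (offsets, values) pair shaped like CachedBinaryDVs above;
// it mirrors the nextDoc()/binaryValue() pattern without implementing BinaryDocValues.
final class CompactBinaryReader {
  static final int NO_MORE_DOCS = Integer.MAX_VALUE; // stand-in for DocIdSetIterator.NO_MORE_DOCS

  private final int[] offsets;        // 1-based positions into values; 0 = document has no value
  private final BytesRefArray values;
  private final BytesRefBuilder spare = new BytesRefBuilder();
  private int docID = -1;

  CompactBinaryReader(int[] offsets, BytesRefArray values) {
    this.offsets = offsets;
    this.values = values;
  }

  int nextDoc() {
    do {
      docID++;
      if (docID == offsets.length) {
        return docID = NO_MORE_DOCS; // walked past the last document
      }
    } while (offsets[docID] == 0);   // skip documents without a value
    return docID;
  }

  BytesRef value() {
    // offsets are shifted by one so that 0 can mean "missing"
    return values.get(spare, offsets[docID] - 1);
  }
}
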
SortedNumericDocValuesWriter.java

@@ -39,13 +39,13 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
private int currentDoc = -1;
private long currentValues[] = new long[8];
private long[] currentValues = new long[8];
private int currentUpto = 0;

private PackedLongValues finalValues;
private PackedLongValues finalValuesCount;

public SortedNumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
SortedNumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
@@ -108,18 +108,28 @@ SortedNumericDocValues getDocValues() {
return new BufferedSortedNumericDocValues(finalValues, finalValuesCount, docsWithField.iterator());
}

static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues) throws IOException {
long[][] values = new long[maxDoc][];
int docID;
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = sortMap.oldToNew(docID);
long[] docValues = new long[oldValues.docValueCount()];
for (int i = 0; i < docValues.length; i++) {
docValues[i] = oldValues.nextValue();
static final class LongValues {
final long[] offsets;
final PackedLongValues values;
LongValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues, float acceptableOverheadRatio) throws IOException {
offsets = new long[maxDoc];
PackedLongValues.Builder valuesBuiler = PackedLongValues.packedBuilder(acceptableOverheadRatio);
int docID;
long offsetIndex = 1; // 0 means the doc has no values
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = sortMap.oldToNew(docID);
int numValues = oldValues.docValueCount();
valuesBuiler.add(numValues);
offsets[newDocID] = offsetIndex++;
for (int i = 0; i < numValues; i++) {
valuesBuiler.add(oldValues.nextValue());
offsetIndex++;
}
}
values[newDocID] = docValues;
values = valuesBuiler.build();
}
return values;


}

@Override
@@ -135,10 +145,10 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
valueCounts = finalValuesCount;
}

final long[][] sorted;
final LongValues sorted;
if (sortMap != null) {
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()));
sorted = new LongValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()), PackedInts.FASTEST);
} else {
sorted = null;
}
@@ -225,11 +235,13 @@ public long cost() {

static class SortingSortedNumericDocValues extends SortedNumericDocValues {
private final SortedNumericDocValues in;
private final long[][] values;
private final LongValues values;
private int docID = -1;
private int upto;
private long upto;
private int numValues = -1;
private long limit;

SortingSortedNumericDocValues(SortedNumericDocValues in, long[][] values) {
SortingSortedNumericDocValues(SortedNumericDocValues in, LongValues values) {
this.in = in;
this.values = values;
}
@@ -241,18 +253,15 @@ public int docID() {

@Override
public int nextDoc() {
while (true) {
do {
docID++;
if (docID == values.length) {
docID = NO_MORE_DOCS;
break;
if (docID >= values.offsets.length) {
return docID = NO_MORE_DOCS;
}
if (values[docID] != null) {
break;
}
// skip missing docs
}
upto = 0;
} while (values.offsets[docID] <= 0);
upto = values.offsets[docID];
numValues = Math.toIntExact(values.values.get(upto-1));
limit = upto + numValues;
return docID;
}

@@ -264,16 +273,23 @@ public int advance(int target) {
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
upto = 0;
return values[docID] != null;
upto = values.offsets[docID];
if (values.offsets[docID] > 0) {
numValues = Math.toIntExact(values.values.get(upto-1));
limit = upto + numValues;
return true;
} else {
limit = upto;
}
return false;
}

@Override
public long nextValue() {
if (upto == values[docID].length) {
if (upto == limit) {
throw new AssertionError();
} else {
return values[docID][upto++];
return values.values.get(upto++);
}
}

@@ -284,7 +300,7 @@ public long cost() {

@Override
public int docValueCount() {
return values[docID].length;
return numValues;
}
}
}
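
For the multi-valued numeric case, a hedged sketch of the packing that the LongValues class above builds: a single PackedLongValues stream holds, for each document with values, the value count followed by the values themselves, and a long[] of per-document offsets is shifted by one so that 0 can mean "no values"; the count is read at offsets[doc] - 1 and the values follow. The PackedNumericLayoutDemo class and its sample data are hypothetical, not the committed code.

import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

// Hypothetical illustration of the [count, v1..vn] packing built by LongValues; not the committed code.
public class PackedNumericLayoutDemo {
  public static void main(String[] args) {
    int maxDoc = 3;
    long[] offsets = new long[maxDoc]; // per-doc pointer into the packed stream; 0 = doc has no values
    PackedLongValues.Builder builder = PackedLongValues.packedBuilder(PackedInts.FASTEST);
    long next = 1;                     // next 1-based position in the stream

    // doc 0 has the values {7, 42}: append its count, then the values.
    builder.add(2);
    offsets[0] = next++;               // count sits at offsets[0] - 1, values start at offsets[0]
    builder.add(7);
    next++;
    builder.add(42);
    next++;

    // doc 1 has no values, so offsets[1] stays 0.

    // doc 2 has the single value {5}.
    builder.add(1);
    offsets[2] = next++;
    builder.add(5);
    next++;

    PackedLongValues packed = builder.build();

    // Decode with the same arithmetic SortingSortedNumericDocValues uses.
    for (int doc = 0; doc < maxDoc; doc++) {
      if (offsets[doc] == 0) {
        System.out.println("doc " + doc + ": no values");
        continue;
      }
      long upto = offsets[doc];
      int count = Math.toIntExact(packed.get(upto - 1));
      StringBuilder sb = new StringBuilder("doc " + doc + ":");
      for (int i = 0; i < count; i++) {
        sb.append(' ').append(packed.get(upto + i));
      }
      System.out.println(sb);
    }
  }
}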
