Skip to content

Commit

Permalink
LUCENE-10633: Dynamic pruning for sorting on SORTED(_SET) fields. (#1023)
Browse files Browse the repository at this point in the history

This commit enables dynamic pruning for queries sorted on SORTED(_SET) fields
by using postings to filter competitive documents.
  • Loading branch information
jpountz committed Jul 29, 2022
1 parent 317e1fa commit 261db55
Show file tree
Hide file tree
Showing 7 changed files with 900 additions and 297 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ Optimizations

* GITHUB#1020: Support #scoreSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)

* LUCENE-10633: Added support for dynamic pruning to queries sorted by a string
field that is indexed with terms and SORTED or SORTED_SET doc values.
(Adrien Grand)

Bug Fixes
---------------------
(No changes)
Expand Down
277 changes: 0 additions & 277 deletions lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

Expand Down Expand Up @@ -211,282 +210,6 @@ public int compareTop(int doc) throws IOException {
}
}

/**
 * Comparator that orders hits by the natural Term order of a field, working on per-segment
 * ordinals wherever it can. It yields the same ordering as {@link
 * org.apache.lucene.search.FieldComparator.TermValComparator}, but resolves each document to its
 * ordinal in the segment's sorted doc values (see {@link
 * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}) and compares those ordinals,
 * falling back to comparing the term bytes only when two slots were filled from different
 * segments. For medium to large result sets this is much faster than {@link
 * org.apache.lucene.search.FieldComparator.TermValComparator}; for very small result sets it may
 * be slower.
 */
public static class TermOrdValComparator extends FieldComparator<BytesRef>
    implements LeafFieldComparator {
  /* Per-slot ordinal (or missingOrd when the slot holds a missing value).
  @lucene.internal */
  final int[] ords;

  /* Per-slot term bytes; null when the slot holds a missing value.
  @lucene.internal */
  final BytesRef[] values;

  /* Per-slot scratch builders that back the entries of values; created lazily. */
  private final BytesRefBuilder[] tempBRs;

  /* Per-slot generation of the reader that last wrote the slot. Two slots with the
  same generation can be compared by ord alone; otherwise their values must be
  compared (slower).
  @lucene.internal */
  final int[] readerGen;

  /* Generation of the reader currently being collected.
  @lucene.internal */
  int currentReaderGen = -1;

  /* Sorted doc values of the current reader.
  @lucene.internal */
  SortedDocValues termsIndex;

  private final String field;

  /* Slot currently at the bottom of the queue, or -1 while the queue is not full.
  @lucene.internal */
  int bottomSlot = -1;

  /* Cached ord of the bottom slot, expressed in the current reader's ord space.
  @lucene.internal */
  int bottomOrd;

  /* True when bottomOrd is an exact ord in the current reader rather than a lower
  bound.
  @lucene.internal */
  boolean bottomSameReader;

  /* Cached value of the bottom slot (values[bottomSlot]).
  @lucene.internal */
  BytesRef bottomValue;

  /** Value handed to {@link #setTopValue}; may be null. */
  BytesRef topValue;

  /* True when topOrd is an exact ord in the current reader rather than a lower bound. */
  boolean topSameReader;

  /* Ord of topValue in the current reader's ord space. */
  int topOrd;

  /** -1 when missing values sort first, 1 when they sort last. */
  final int missingSortCmp;

  /** Ordinal substituted for documents that have no value. */
  final int missingOrd;

  /** Creates a comparator that sorts missing values first. */
  public TermOrdValComparator(int numHits, String field) {
    this(numHits, field, false);
  }

  /**
   * Creates a comparator with explicit control over where missing values go.
   *
   * @param numHits number of slots to allocate
   * @param field name of the field to sort on
   * @param sortMissingLast {@code true} to place documents without a value at the end, {@code
   *     false} to place them at the start
   */
  public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
    this.field = field;
    this.ords = new int[numHits];
    this.values = new BytesRef[numHits];
    this.tempBRs = new BytesRefBuilder[numHits];
    this.readerGen = new int[numHits];
    // Missing-last uses the largest possible ord, missing-first uses -1.
    this.missingSortCmp = sortMissingLast ? 1 : -1;
    this.missingOrd = sortMissingLast ? Integer.MAX_VALUE : -1;
  }

  /** Returns the current reader's ord for {@code doc}, or -1 when the doc has no value. */
  private int getOrdForDoc(int doc) throws IOException {
    return termsIndex.advanceExact(doc) ? termsIndex.ordValue() : -1;
  }

  /** Returns the ord for {@code doc}, with {@link #missingOrd} substituted for a missing value. */
  private int ordOrMissing(int doc) throws IOException {
    final int docOrd = getOrdForDoc(doc);
    return docOrd == -1 ? missingOrd : docOrd;
  }

  /**
   * Compares two possibly-null values: two nulls are equal, a lone null is placed according to
   * {@link #missingSortCmp}, and two non-null values use {@link BytesRef#compareTo}.
   */
  private int compareMissingAware(BytesRef left, BytesRef right) {
    if (left == null) {
      return right == null ? 0 : missingSortCmp;
    }
    if (right == null) {
      return -missingSortCmp;
    }
    return left.compareTo(right);
  }

  @Override
  public int compare(int slot1, int slot2) {
    if (readerGen[slot1] != readerGen[slot2]) {
      // Slots were written by different readers: ords are not comparable, use the bytes.
      return compareMissingAware(values[slot1], values[slot2]);
    }
    return ords[slot1] - ords[slot2];
  }

  @Override
  public int compareBottom(int doc) throws IOException {
    assert bottomSlot != -1;
    final int docOrd = ordOrMissing(doc);
    if (!bottomSameReader) {
      // bottomOrd is only the lower bound computed by setBottom, so an equal ord still
      // means the bottom value is greater than the doc's value.
      return bottomOrd >= docOrd ? 1 : -1;
    }
    // Exact ord in this reader: directly comparable, including the equal case.
    return bottomOrd - docOrd;
  }

  @Override
  public void copy(int slot, int doc) throws IOException {
    final int docOrd = getOrdForDoc(doc);
    final int slotOrd;
    if (docOrd != -1) {
      assert docOrd >= 0;
      BytesRefBuilder scratch = tempBRs[slot];
      if (scratch == null) {
        scratch = new BytesRefBuilder();
        tempBRs[slot] = scratch;
      }
      // Keep a private copy of the term bytes so the slot stays valid across readers.
      scratch.copyBytes(termsIndex.lookupOrd(docOrd));
      values[slot] = scratch.get();
      slotOrd = docOrd;
    } else {
      values[slot] = null;
      slotOrd = missingOrd;
    }
    ords[slot] = slotOrd;
    readerGen[slot] = currentReaderGen;
  }

  /** Returns the {@link SortedDocValues} of {@code field} for the given segment. */
  protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
      throws IOException {
    return DocValues.getSorted(context.reader(), field);
  }

  @Override
  public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
    termsIndex = getSortedDocValues(context, field);
    currentReaderGen++;

    // Re-express the top value in the new reader's ord space.
    if (topValue == null) {
      topSameReader = true;
      topOrd = missingOrd;
    } else {
      final int found = termsIndex.lookupTerm(topValue);
      if (found < 0) {
        // Term is absent from this reader: keep a lower bound instead of an exact ord.
        topSameReader = false;
        topOrd = -found - 2;
      } else {
        topSameReader = true;
        topOrd = found;
      }
    }

    // Likewise for the bottom, once the queue has one.
    if (bottomSlot != -1) {
      setBottom(bottomSlot);
    }

    return this;
  }

  @Override
  public void setBottom(final int bottom) throws IOException {
    bottomSlot = bottom;
    bottomValue = values[bottom];

    if (readerGen[bottom] == currentReaderGen) {
      // Slot was filled from the current reader: its ord is already exact.
      bottomOrd = ords[bottom];
      bottomSameReader = true;
      return;
    }

    if (bottomValue == null) {
      // A missing value maps to missingOrd regardless of the reader, so the slot can be
      // re-tagged with the current generation as is.
      assert ords[bottom] == missingOrd;
      bottomOrd = missingOrd;
      bottomSameReader = true;
      readerGen[bottom] = currentReaderGen;
      return;
    }

    final int found = termsIndex.lookupTerm(bottomValue);
    if (found >= 0) {
      // Exact match in this reader: adopt its ord and re-tag the slot.
      bottomOrd = found;
      bottomSameReader = true;
      readerGen[bottom] = currentReaderGen;
      ords[bottom] = found;
    } else {
      // Term is absent from this reader: keep a lower bound instead of an exact ord.
      bottomOrd = -found - 2;
      bottomSameReader = false;
    }
  }

  @Override
  public void setTopValue(BytesRef value) {
    // A null value is accepted: it means the last doc of the prior search had no value.
    topValue = value;
  }

  @Override
  public BytesRef value(int slot) {
    return values[slot];
  }

  @Override
  public int compareTop(int doc) throws IOException {
    final int docOrd = ordOrMissing(doc);
    if (!topSameReader) {
      // topOrd is only a lower bound here, so an equal ord still means the doc's value is
      // smaller than the top value.
      return docOrd <= topOrd ? 1 : -1;
    }
    // Exact ord in this reader: directly comparable, including the equal case.
    return topOrd - docOrd;
  }

  @Override
  public int compareValues(BytesRef val1, BytesRef val2) {
    return compareMissingAware(val1, val2);
  }

  @Override
  public void setScorer(Scorable scorer) {}
}

/**
* Sorts by field's natural Term sort order. All comparisons are done using BytesRef.compareTo,
* which is slow for medium to large result sets but possibly very fast for very small results
Expand Down
4 changes: 2 additions & 2 deletions lucene/core/src/java/org/apache/lucene/search/SortField.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.lucene.search.comparators.FloatComparator;
import org.apache.lucene.search.comparators.IntComparator;
import org.apache.lucene.search.comparators.LongComparator;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -536,8 +537,7 @@ public FieldComparator<?> getComparator(final int numHits, boolean enableSkippin
break;

case STRING:
return new FieldComparator.TermOrdValComparator(
numHits, field, missingValue == STRING_LAST);
return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);

case STRING_VAL:
fieldComparator =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

Expand Down Expand Up @@ -178,8 +179,7 @@ public void setMissingValue(Object missingValue) {

@Override
public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
return new FieldComparator.TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST) {
return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
Expand Down

0 comments on commit 261db55

Please sign in to comment.