Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-10633: Dynamic pruning for sorting on SORTED(_SET) fields. #1023

Merged
merged 7 commits into from
Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ Optimizations

* GITHUB#1020: Support #scoreSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)

* LUCENE-10633: Added support for dynamic pruning to queries sorted by a string
field that is indexed with terms and SORTED or SORTED_SET doc values.
(Adrien Grand)

Bug Fixes
---------------------
* LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)
Expand Down
277 changes: 0 additions & 277 deletions lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

Expand Down Expand Up @@ -211,282 +210,6 @@ public int compareTop(int doc) throws IOException {
}
}

/**
 * Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
 * {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
 * strings to their relative ordinal positions (using the index returned by {@link
 * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
 * using the ordinals. For medium to large results, this comparator will be much faster than
 * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
 * it may be slower.
 *
 * <p>Ordinals are only comparable within one segment. Each slot therefore remembers which
 * segment ("reader generation") its ordinal came from, and the actual term bytes are kept so
 * that slots filled from different segments can still be compared by value.
 */
public static class TermOrdValComparator extends FieldComparator<BytesRef>
    implements LeafFieldComparator {
  /* Ords for each slot.
  @lucene.internal */
  final int[] ords;

  /* Values for each slot; null when the document had no value.
  @lucene.internal */
  final BytesRef[] values;

  /* Per-slot reusable buffers backing the entries of values; allocated lazily in copy(). */
  private final BytesRefBuilder[] tempBRs;

  /* Which reader last copied a value into the slot. When
  we compare two slots, we just compare-by-ord if the
  readerGen is the same; else we must compare the
  values (slower).
  @lucene.internal */
  final int[] readerGen;

  /* Gen of current reader we are on; incremented by every getLeafComparator call.
  @lucene.internal */
  int currentReaderGen = -1;

  /* Current reader's doc ord/values.
  @lucene.internal */
  SortedDocValues termsIndex;

  private final String field;

  /* Bottom slot, or -1 if queue isn't full yet
  @lucene.internal */
  int bottomSlot = -1;

  /* Bottom ord (same as ords[bottomSlot] once bottomSlot
  is set). Cached for faster compares.
  @lucene.internal */
  int bottomOrd;

  /* True if current bottom slot matches the current
  reader.
  @lucene.internal */
  boolean bottomSameReader;

  /* Bottom value (same as values[bottomSlot] once
  bottomSlot is set). Cached for faster compares.
  @lucene.internal */
  BytesRef bottomValue;

  /** Set by setTopValue. */
  BytesRef topValue;

  /** True if {@link #topOrd} is an exact ordinal in the current segment (or the missing ord). */
  boolean topSameReader;

  /** Ordinal of (or lower bound for) {@link #topValue} in the current segment. */
  int topOrd;

  /** -1 if missing values are sorted first, 1 if they are sorted last */
  final int missingSortCmp;

  /** Which ordinal to use for a missing value. */
  final int missingOrd;

  /** Creates this, sorting missing values first. */
  public TermOrdValComparator(int numHits, String field) {
    this(numHits, field, false);
  }

  /**
   * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
   * put missing values at the end.
   *
   * @param numHits number of slots to allocate
   * @param field name of the field whose sorted doc values are read
   * @param sortMissingLast whether documents without a value sort after all other documents
   */
  public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
    ords = new int[numHits];
    values = new BytesRef[numHits];
    tempBRs = new BytesRefBuilder[numHits];
    readerGen = new int[numHits];
    this.field = field;
    if (sortMissingLast) {
      missingSortCmp = 1;
      missingOrd = Integer.MAX_VALUE;
    } else {
      missingSortCmp = -1;
      missingOrd = -1;
    }
  }

  /** Returns the ordinal of {@code doc} in the current segment, or -1 if it has no value. */
  private int getOrdForDoc(int doc) throws IOException {
    if (termsIndex.advanceExact(doc)) {
      return termsIndex.ordValue();
    } else {
      return -1;
    }
  }

  /** Returns the ordinal of {@code doc}, substituting {@link #missingOrd} when it has no value. */
  private int getOrdOrMissing(int doc) throws IOException {
    final int ord = getOrdForDoc(doc);
    return ord == -1 ? missingOrd : ord;
  }

  /** Compares two possibly-null values; null ordering is decided by {@link #missingSortCmp}. */
  private int compareNullable(BytesRef val1, BytesRef val2) {
    if (val1 == null) {
      if (val2 == null) {
        return 0;
      }
      return missingSortCmp;
    } else if (val2 == null) {
      return -missingSortCmp;
    }
    return val1.compareTo(val2);
  }

  /**
   * Compares two slots, by ordinal when both were filled from the same segment and by term bytes
   * otherwise.
   */
  @Override
  public int compare(int slot1, int slot2) {
    if (readerGen[slot1] == readerGen[slot2]) {
      // Integer.compare rather than subtraction: same sign, but cannot overflow.
      return Integer.compare(ords[slot1], ords[slot2]);
    }
    return compareNullable(values[slot1], values[slot2]);
  }

  /** Compares the bottom slot against {@code doc} using ordinals of the current segment. */
  @Override
  public int compareBottom(int doc) throws IOException {
    assert bottomSlot != -1;
    final int docOrd = getOrdOrMissing(doc);
    if (bottomSameReader) {
      // ord is precisely comparable, even in the equal case
      return Integer.compare(bottomOrd, docOrd);
    } else if (bottomOrd >= docOrd) {
      // the equals case always means bottom is > doc
      // (because we set bottomOrd to the lower bound in
      // setBottom):
      return 1;
    } else {
      return -1;
    }
  }

  /** Copies the ordinal and (a private copy of) the term bytes of {@code doc} into {@code slot}. */
  @Override
  public void copy(int slot, int doc) throws IOException {
    int ord = getOrdForDoc(doc);
    if (ord == -1) {
      ord = missingOrd;
      values[slot] = null;
    } else {
      assert ord >= 0;
      if (tempBRs[slot] == null) {
        tempBRs[slot] = new BytesRefBuilder();
      }
      // Copy the bytes: the slot's value must stay readable after moving to another segment.
      tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
      values[slot] = tempBRs[slot].get();
    }
    ords[slot] = ord;
    readerGen[slot] = currentReaderGen;
  }

  /** Retrieves the SortedDocValues for the field in this segment */
  protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
      throws IOException {
    return DocValues.getSorted(context.reader(), field);
  }

  /**
   * Switches this comparator to a new segment: loads its doc values, starts a new reader
   * generation, and re-resolves the top and bottom values against the new segment's ordinals.
   *
   * @return this instance, which also acts as the per-segment comparator
   */
  @Override
  public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
    termsIndex = getSortedDocValues(context, field);
    currentReaderGen++;

    if (topValue != null) {
      // Recompute topOrd/SameReader
      int ord = termsIndex.lookupTerm(topValue);
      if (ord >= 0) {
        topSameReader = true;
        topOrd = ord;
      } else {
        // Term is absent from this segment: keep a lower bound derived from the negative
        // lookupTerm result (see the comment in compareTop).
        topSameReader = false;
        topOrd = -ord - 2;
      }
    } else {
      topOrd = missingOrd;
      topSameReader = true;
    }

    if (bottomSlot != -1) {
      // Recompute bottomOrd/SameReader
      setBottom(bottomSlot);
    }

    return this;
  }

  /**
   * Records {@code bottom} as the bottom slot and caches its value and its ordinal (exact, or a
   * lower bound) relative to the current segment.
   */
  @Override
  public void setBottom(final int bottom) throws IOException {
    bottomSlot = bottom;

    bottomValue = values[bottomSlot];
    if (currentReaderGen == readerGen[bottomSlot]) {
      bottomOrd = ords[bottomSlot];
      bottomSameReader = true;
    } else {
      if (bottomValue == null) {
        // a missing value uses the same missingOrd in every segment, so it is exactly
        // comparable here as well
        assert ords[bottomSlot] == missingOrd;
        bottomOrd = missingOrd;
        bottomSameReader = true;
        readerGen[bottomSlot] = currentReaderGen;
      } else {
        final int ord = termsIndex.lookupTerm(bottomValue);
        if (ord < 0) {
          // not found: store the lower bound, compareBottom treats ties accordingly
          bottomOrd = -ord - 2;
          bottomSameReader = false;
        } else {
          bottomOrd = ord;
          // exact value match
          bottomSameReader = true;
          readerGen[bottomSlot] = currentReaderGen;
          ords[bottomSlot] = bottomOrd;
        }
      }
    }
  }

  /** Stores the top value; it is resolved to an ordinal by the next getLeafComparator call. */
  @Override
  public void setTopValue(BytesRef value) {
    // null is fine: it means the last doc of the prior
    // search was missing this value
    topValue = value;
  }

  /** Returns the value held in {@code slot}, or null if that document had no value. */
  @Override
  public BytesRef value(int slot) {
    return values[slot];
  }

  /** Compares the top value against {@code doc} using ordinals of the current segment. */
  @Override
  public int compareTop(int doc) throws IOException {
    final int ord = getOrdOrMissing(doc);

    if (topSameReader) {
      // ord is precisely comparable, even in the equal
      // case
      return Integer.compare(topOrd, ord);
    } else if (ord <= topOrd) {
      // the equals case always means doc is < value
      // (because we set topOrd to the lower bound)
      return 1;
    } else {
      return -1;
    }
  }

  /** Compares two values by their bytes, ordering nulls according to the missing-value policy. */
  @Override
  public int compareValues(BytesRef val1, BytesRef val2) {
    return compareNullable(val1, val2);
  }

  /** No-op: this comparator does not use scores. */
  @Override
  public void setScorer(Scorable scorer) {}
}

/**
* Sorts by field's natural Term sort order. All comparisons are done using BytesRef.compareTo,
* which is slow for medium to large result sets but possibly very fast for very small results
Expand Down
4 changes: 2 additions & 2 deletions lucene/core/src/java/org/apache/lucene/search/SortField.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.lucene.search.comparators.FloatComparator;
import org.apache.lucene.search.comparators.IntComparator;
import org.apache.lucene.search.comparators.LongComparator;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -536,8 +537,7 @@ public FieldComparator<?> getComparator(final int numHits, boolean enableSkippin
break;

case STRING:
return new FieldComparator.TermOrdValComparator(
numHits, field, missingValue == STRING_LAST);
return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);

case STRING_VAL:
fieldComparator =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

Expand Down Expand Up @@ -178,8 +179,7 @@ public void setMissingValue(Object missingValue) {

@Override
public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
return new FieldComparator.TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST) {
return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
Expand Down
Loading