LUCENE-10070: Skip deleted documents during facet counting for all documents #2577

Closed · wants to merge 10 commits
6 changes: 5 additions & 1 deletion dev-tools/scripts/buildAndPushRelease.py
@@ -356,7 +356,11 @@ def main():
   print('Next run the smoker tester:')
   p = re.compile(".*/")
   m = p.match(sys.argv[0])
-  print('%s -u %ssmokeTestRelease.py %s' % (sys.executable, m.group(), url))
+  if not c.sign:
+    signed = "--not-signed"
+  else:
+    signed = ""
+  print('%s -u %ssmokeTestRelease.py %s %s' % (sys.executable, m.group(), signed, url))
 
 if __name__ == '__main__':
   try:
12 changes: 8 additions & 4 deletions dev-tools/scripts/smokeTestRelease.py
@@ -170,11 +170,15 @@ def checkJARMetaData(desc, jarFile, gitRevision, version):

   if gitRevision != 'skip':
     # Make sure this matches the version and git revision we think we are releasing:
-    # TODO: LUCENE-7023: is it OK that Implementation-Version's value now spans two lines?
-    verifyRevision = 'Implementation-Version: %s %s' % (version, gitRevision)
-    if s.find(verifyRevision) == -1:
-      raise RuntimeError('%s is missing "%s" inside its META-INF/MANIFEST.MF (wrong git revision?)' % \
+    match = re.search("Implementation-Version: (.+\r\n .+)", s, re.MULTILINE)
+    if match:
+      implLine = match.group(1).replace("\r\n ", "")
+      verifyRevision = '%s %s' % (version, gitRevision)
+      if implLine.find(verifyRevision) == -1:
+        raise RuntimeError('%s is missing "%s" inside its META-INF/MANIFEST.MF (wrong git revision?)' % \
                           (desc, verifyRevision))
+    else:
+      raise RuntimeError('%s is missing Implementation-Version inside its META-INF/MANIFEST.MF' % desc)
 
   notice = decodeUTF8(z.read(NOTICE_FILE_NAME))
   license = decodeUTF8(z.read(LICENSE_FILE_NAME))
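For context on the manifest check above: java.util.jar.Manifest folds attribute values longer than 72 bytes onto continuation lines that start with a single space, which is why Implementation-Version can span two physical lines and why the script joins them on "\r\n ". A small self-contained Java sketch of the folding and the unfold (the version and git revision strings are made up):

import java.io.ByteArrayOutputStream;
import java.util.jar.Attributes;
import java.util.jar.Manifest;

public class ManifestFoldingDemo {
  public static void main(String[] args) throws Exception {
    Manifest mf = new Manifest();
    Attributes main = mf.getMainAttributes();
    main.put(Attributes.Name.MANIFEST_VERSION, "1.0"); // required, otherwise write() emits no main section
    // A long value similar to what the Lucene build writes; purely illustrative:
    main.put(Attributes.Name.IMPLEMENTATION_VERSION,
        "9.0.0 0123456789abcdef0123456789abcdef01234567 - jenkins - 2021-09-28 10:00:00");
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    mf.write(out);
    String raw = out.toString("UTF-8");
    System.out.println(raw);                      // value folded across a "\r\n " continuation
    System.out.println(raw.replace("\r\n ", "")); // unfolding yields one logical line, as the script does
  }
}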
12 changes: 11 additions & 1 deletion lucene/CHANGES.txt
@@ -25,7 +25,15 @@ Optimizations

 Bug Fixes
 ---------------------
-(No changes)
+
+* LUCENE-10110: MultiCollector now handles single leaf collector that wants to skip low-scoring hits
+  but the combined score mode doesn't allow it. (Jim Ferenczi)
+
+* LUCENE-10111: Missing calculating the bytes used of DocsWithFieldSet in NormValuesWriter.
+  (Lu Xugang)
+
+* LUCENE-10116: Missing calculating the bytes used of DocsWithFieldSet and currentValues in SortedSetDocValuesWriter.
+  (Lu Xugang)
 
 Build
 ---------------------
@@ -155,6 +163,8 @@ Bug Fixes

 * LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen)
 
+* LUCENE-10070 Skip deleted docs when accumulating facet counts for all docs. (Ankur Goel, Greg Miller)
+
 * LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces.
   (Jim Ferenczi)
 
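The LUCENE-10070 entry above is the change this PR tracks: when facet counts are accumulated for all documents rather than for a hit set, segments with deletions must consult live docs so deleted documents are not counted. The facet-module code itself is not part of this excerpt, so the sketch below only illustrates the general per-segment pattern; countAllForSegment, the counts array, and the sorted-numeric ordinal field are hypothetical.

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;

final class LiveDocsCountingSketch {
  /** Counts every value of a numeric doc-values field, skipping deleted documents. */
  static void countAllForSegment(LeafReaderContext context, String field, int[] counts)
      throws IOException {
    SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
    Bits liveDocs = context.reader().getLiveDocs(); // null when the segment has no deletions
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      if (liveDocs != null && liveDocs.get(doc) == false) {
        continue; // deleted document: its values are not counted
      }
      for (int i = 0; i < values.docValueCount(); i++) {
        counts[(int) values.nextValue()]++;
      }
    }
  }
}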
@@ -60,7 +60,7 @@ public void addValue(int docID, long value) {
   }
 
   private void updateBytesUsed() {
-    final long newBytesUsed = pending.ramBytesUsed();
+    final long newBytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
     iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
     bytesUsed = newBytesUsed;
   }
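The one-line change above adds docsWithField.ramBytesUsed() to the per-field total tracked by NormValuesWriter (the LUCENE-10111 entry). The surrounding pattern is what makes a one-liner sufficient: the writer remembers the last total it reported and pushes only the delta to the shared IndexWriter counter. A minimal sketch of that delta pattern, with illustrative names rather than the real class:

import org.apache.lucene.util.Counter;

final class DeltaAccountingSketch {
  private final Counter iwBytesUsed; // shared, IndexWriter-level counter
  private long bytesUsed;            // last total this writer reported

  DeltaAccountingSketch(Counter iwBytesUsed) {
    this.iwBytesUsed = iwBytesUsed;
  }

  /** Recomputes the current total and reports only the difference upstream. */
  void updateBytesUsed(long pendingBytes, long docsWithFieldBytes) {
    final long newBytesUsed = pendingBytes + docsWithFieldBytes;
    iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
    bytesUsed = newBytesUsed;
  }
}

In a standalone test, Counter.newCounter() would stand in for the counter that IndexWriter hands to the real writers.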
@@ -27,6 +27,7 @@
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -65,7 +66,11 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
     pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
     pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     docsWithField = new DocsWithFieldSet();
-    bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
+    bytesUsed =
+        pending.ramBytesUsed()
+            + pendingCounts.ramBytesUsed()
+            + docsWithField.ramBytesUsed()
+            + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(bytesUsed);
   }
 
@@ -133,7 +138,11 @@ private void addOneValue(BytesRef value) {
   }
 
   private void updateBytesUsed() {
-    final long newBytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
+    final long newBytesUsed =
+        pending.ramBytesUsed()
+            + pendingCounts.ramBytesUsed()
+            + docsWithField.ramBytesUsed()
+            + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
     bytesUsed = newBytesUsed;
   }
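The SortedSetDocValuesWriter change above (the LUCENE-10116 entry) additionally charges the docsWithField set and the currentValues scratch array, the latter through RamUsageEstimator.sizeOf. A tiny sketch of what that call measures for a primitive array; the exact byte counts depend on the JVM's object header size and alignment, so none are hard-coded:

import org.apache.lucene.util.RamUsageEstimator;

public class SizeOfDemo {
  public static void main(String[] args) {
    // Array object header plus 4 bytes per int element, rounded up for alignment.
    System.out.println(RamUsageEstimator.sizeOf(new int[8]));
    // A grown scratch buffer costs proportionally more, which is why the writer
    // recomputes and re-reports its usage as values are added.
    System.out.println(RamUsageEstimator.sizeOf(new int[1024]));
  }
}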
@@ -138,11 +138,6 @@ private FieldValueHitQueue(SortField[] fields, int size) {
       reverseMul[i] = field.reverse ? -1 : 1;
       comparators[i] = field.getComparator(size, i);
     }
-    if (numComparators == 1) {
-      // inform a comparator that sort is based on this single field
-      // to enable some optimizations for skipping over non-competitive documents
-      comparators[0].setSingleSort();
-    }
   }
 
   /**
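The single-sort optimization removed here is not dropped: the same setSingleSort() call reappears in TopFieldCollector.create further down, now guarded by after == null (see the sketch after that hunk).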
46 changes: 30 additions & 16 deletions lucene/core/src/java/org/apache/lucene/search/MultiCollector.java
@@ -25,10 +25,14 @@
 import org.apache.lucene.index.LeafReaderContext;
 
 /**
- * A {@link Collector} which allows running a search with several
- * {@link Collector}s. It offers a static {@link #wrap} method which accepts a
- * list of collectors and wraps them with {@link MultiCollector}, while
- * filtering out the <code>null</code> null ones.
+ * A {@link Collector} which allows running a search with several {@link Collector}s. It offers a
+ * static {@link #wrap} method which accepts a list of collectors and wraps them with {@link
+ * MultiCollector}, while filtering out the <code>null</code> null ones.
+ *
+ * <p><b>NOTE:</b>When mixing collectors that want to skip low-scoring hits ({@link
+ * ScoreMode#TOP_SCORES}) with ones that require to see all hits, such as mixing {@link
+ * TopScoreDocCollector} and {@link TotalHitCountCollector}, it should be faster to run the query
+ * twice, once for each collector, rather than using this wrapper on a single search.
  */
 public class MultiCollector implements Collector {
 
@@ -48,7 +52,7 @@ public static Collector wrap(Collector... collectors) {
    *   <li>Otherwise the method returns a {@link MultiCollector} which wraps the
    *       non-<code>null</code> ones.
    * </ul>
-   * 
+   *
    * @throws IllegalArgumentException
    *           if either 0 collectors were input, or all collectors are
    *           <code>null</code>.
@@ -87,7 +91,7 @@ public static Collector wrap(Iterable<? extends Collector> collectors) {
       return new MultiCollector(colls);
     }
   }
 
   private final boolean cacheScores;
   private final Collector[] collectors;
 
@@ -118,6 +122,7 @@ public ScoreMode scoreMode() {
   @Override
   public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
     final List<LeafCollector> leafCollectors = new ArrayList<>(collectors.length);
+    ScoreMode leafScoreMode = null;
     for (Collector collector : collectors) {
       final LeafCollector leafCollector;
       try {
@@ -126,15 +131,24 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept
         // this leaf collector does not need this segment
         continue;
       }
+      if (leafScoreMode == null) {
+        leafScoreMode = collector.scoreMode();
+      } else if (leafScoreMode != collector.scoreMode()) {
+        leafScoreMode = ScoreMode.COMPLETE;
+      }
       leafCollectors.add(leafCollector);
     }
-    switch (leafCollectors.size()) {
-      case 0:
-        throw new CollectionTerminatedException();
-      case 1:
+    if (leafCollectors.isEmpty()) {
+      throw new CollectionTerminatedException();
+    } else {
+      // Wraps single leaf collector that wants to skip low-scoring hits (ScoreMode.TOP_SCORES)
+      // but the global score mode doesn't allow it.
+      if (leafCollectors.size() == 1
+          && (scoreMode() == ScoreMode.TOP_SCORES || leafScoreMode != ScoreMode.TOP_SCORES)) {
         return leafCollectors.get(0);
-      default:
-        return new MultiLeafCollector(leafCollectors, cacheScores, scoreMode() == ScoreMode.TOP_SCORES);
+      }
+      return new MultiLeafCollector(
+          leafCollectors, cacheScores, scoreMode() == ScoreMode.TOP_SCORES);
     }
   }

@@ -210,9 +224,9 @@ private boolean allCollectorsTerminated() {
     }
 
   }
 
   final static class MinCompetitiveScoreAwareScorable extends FilterScorable {
 
     private final int idx;
     private final float[] minScores;
 
@@ -221,7 +235,7 @@ final static class MinCompetitiveScoreAwareScorable extends FilterScorable {
       this.idx = idx;
       this.minScores = minScores;
     }
 
     @Override
     public void setMinCompetitiveScore(float minScore) throws IOException {
       if (minScore > minScores[idx]) {
@@ -239,7 +253,7 @@ private float minScore() {
       }
       return min;
     }
 
   }
 
 }
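The javadoc NOTE and the getLeafCollector changes above work together: the wrapper now tracks the combined score mode of the leaf collectors that remain for a segment and only returns a lone leaf collector unwrapped when its wish to skip low-scoring hits is compatible with the overall score mode; otherwise it stays wrapped so it cannot skip hits on its own. A hedged usage sketch of the situation the NOTE warns about (query and searcher construction omitted; the collector sizes are arbitrary):

import java.io.IOException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHitCountCollector;

public class MultiCollectorUsage {
  static void topHitsPlusTotalCount(IndexSearcher searcher, Query query) throws IOException {
    TopScoreDocCollector top = TopScoreDocCollector.create(10, 10); // wants ScoreMode.TOP_SCORES
    TotalHitCountCollector total = new TotalHitCountCollector();    // must see every hit
    Collector wrapped = MultiCollector.wrap(top, total);            // combined mode becomes COMPLETE
    searcher.search(query, wrapped);
    // Because the count collector needs every hit, nothing could be skipped; per the NOTE above,
    // running two separate searches may be faster when the top-hits side could skip work.
    System.out.println(total.getTotalHits() + " matching docs, "
        + top.topDocs().scoreDocs.length + " top hits collected");
  }
}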
@@ -517,6 +517,13 @@ static TopFieldCollector create(Sort sort, int numHits, FieldDoc after,
     FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);
 
     if (after == null) {
+      // inform a comparator that sort is based on this single field
+      // to enable some optimizations for skipping over non-competitive documents
+      // We can't set single sort when the `after` parameter is non-null as it's
+      // an implicit sort over the document id.
+      if (queue.comparators.length == 1) {
+        queue.comparators[0].setSingleSort();
+      }
       return new SimpleFieldCollector(sort, queue, numHits, hitsThresholdChecker, minScoreAcc);
     } else {
       if (after.fields == null) {
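This hunk is the counterpart of the FieldValueHitQueue deletion above: setSingleSort() is now invoked from TopFieldCollector.create, and only when there is no after document, because paging with searchAfter implies a doc-id tiebreak. A hedged sketch of the two call paths (the field name and page size are arbitrary):

import java.io.IOException;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;

public class SingleSortPaging {
  static TopDocs firstTwoPages(IndexSearcher searcher, Query query) throws IOException {
    Sort sort = new Sort(new SortField("price", SortField.Type.LONG));
    // No "after" document: a single comparator may be told it is the only sort key,
    // letting it skip non-competitive documents.
    TopDocs firstPage = searcher.search(query, 10, sort);
    if (firstPage.scoreDocs.length == 0) {
      return firstPage; // nothing to page past
    }
    FieldDoc after = (FieldDoc) firstPage.scoreDocs[firstPage.scoreDocs.length - 1];
    // With "after" set, doc id acts as an implicit tiebreaker, so setSingleSort is not applied.
    return searcher.searchAfter(after, query, 10, sort);
  }
}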