-
Notifications
You must be signed in to change notification settings - Fork 977
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
LUCENE-10061: Implements dynamic pruning support for CombinedFieldsQuery #418
Open
zacharymorn
wants to merge
17
commits into
apache:main
Choose a base branch
from
zacharymorn:LUCENE-10061-CombinedFieldsQuery-Pruning-Support
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
3c60209
LUCENE-10061: Implements basic dynamic pruning support for CombinedFi…
zacharymorn 2ba435e
replace cartesian product calculation with simple upper-bound
zacharymorn 1a71469
Bug fix and optimize per CPU profiler
zacharymorn 8ecdc69
Fix style
zacharymorn b4ac2b4
Update test
zacharymorn 4244f39
Update amplifiedMaxFreq calculation
zacharymorn 81206d4
Simplify and reduce maxFreq
zacharymorn 68eb9b8
Merge branch 'main' into LUCENE-10061-CombinedFieldsQuery-Pruning-Sup…
zacharymorn 6d5e780
cache leadingImpactsPerField
zacharymorn db2446f
test leadImpact from highest weighted field
zacharymorn 3d0a215
Fix bug
zacharymorn 8a7ea99
Address feedback - avoid repeated computation for max weighted field,…
zacharymorn 0a9bdcc
Refactoring - extract out logic that merges impacts within the same f…
zacharymorn 808fec2
Refactoring - use ImpactsMergingUtils in CombinedFieldsQuery
zacharymorn 75c5b04
Use getDocIdUpTo to determine lead impacts
zacharymorn cbbd28b
Use leading impacts across fields
zacharymorn 502d44e
Use higher abstraction SynonymImpactsSource in CombinedFieldQuery
zacharymorn File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
192 changes: 192 additions & 0 deletions
192
lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import org.apache.lucene.index.Impact; | ||
import org.apache.lucene.index.Impacts; | ||
import org.apache.lucene.index.ImpactsEnum; | ||
import org.apache.lucene.util.PriorityQueue; | ||
import org.apache.lucene.util.SmallFloat; | ||
|
||
/** | ||
* Utils for merging impacts for SynonymQuery, CombinedFieldsQuery etc | ||
* | ||
* @lucene.internal | ||
*/ | ||
public final class ImpactsMergingUtils { | ||
/** Cache of decoded norms. */ | ||
private static final float[] LENGTH_TABLE = new float[256]; | ||
|
||
static { | ||
for (int i = 0; i < 256; i++) { | ||
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); | ||
} | ||
} | ||
|
||
/** | ||
* Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if | ||
* there is no such level. | ||
*/ | ||
private static int getLevel(Impacts impacts, int docIdUpTo) { | ||
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { | ||
if (impacts.getDocIdUpTo(level) >= docIdUpTo) { | ||
return level; | ||
} | ||
} | ||
return -1; | ||
} | ||
|
||
private static class SubIterator { | ||
final Iterator<Impact> iterator; | ||
int previousFreq; | ||
Impact current; | ||
|
||
SubIterator(Iterator<Impact> iterator) { | ||
this.iterator = iterator; | ||
this.current = iterator.next(); | ||
} | ||
|
||
void next() { | ||
previousFreq = current.freq; | ||
if (iterator.hasNext() == false) { | ||
current = null; | ||
} else { | ||
current = iterator.next(); | ||
} | ||
} | ||
} | ||
|
||
private static double normToLength(long norm) { | ||
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; | ||
} | ||
|
||
/** | ||
* Merge impacts from multiple impactsEnum (terms matches) within the same field. The high level | ||
* logic is to combine freqs that have the same norm from impacts. | ||
*/ | ||
public static List<Impact> mergeImpactsPerField( | ||
ImpactsEnum[] impactsEnum, | ||
Impacts[] impacts, | ||
float[] termBoosts, | ||
int docIdUpTo, | ||
boolean combineMultiNorms) { | ||
assert impactsEnum.length == impacts.length; | ||
assert impactsEnum.length == termBoosts.length; | ||
|
||
List<List<Impact>> toMerge = new ArrayList<>(); | ||
|
||
for (int i = 0; i < impactsEnum.length; ++i) { | ||
if (impactsEnum[i].docID() <= docIdUpTo) { | ||
int impactsLevel = getLevel(impacts[i], docIdUpTo); | ||
if (impactsLevel == -1) { | ||
// One instance doesn't have impacts that cover up to docIdUpTo | ||
// Return impacts that trigger the maximum score | ||
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); | ||
} | ||
final List<Impact> impactList; | ||
if (termBoosts[i] != 1f) { | ||
float boost = termBoosts[i]; | ||
impactList = | ||
impacts[i].getImpacts(impactsLevel).stream() | ||
.map( | ||
impact -> { | ||
int boostedFreq = (int) Math.ceil(impact.freq * boost); | ||
long boostedNorm = | ||
combineMultiNorms | ||
? SmallFloat.intToByte4( | ||
(int) Math.floor(normToLength(impact.norm) * boost)) | ||
: impact.norm; | ||
return new Impact(boostedFreq, boostedNorm); | ||
}) | ||
.collect(Collectors.toList()); | ||
} else { | ||
impactList = impacts[i].getImpacts(impactsLevel); | ||
} | ||
toMerge.add(impactList); | ||
} | ||
} | ||
|
||
// all impactEnums for this field were positioned beyond docIdUpTo, which is possible when | ||
// 1. there are multiple fields involved. | ||
// 2. docIdUpTo was taken from minimum from all impactEnums across fields | ||
if (toMerge.size() == 0) { | ||
return new ArrayList<>(); | ||
} | ||
|
||
if (toMerge.size() == 1) { | ||
// common if one synonym is common and the other one is rare | ||
return toMerge.get(0); | ||
} | ||
|
||
PriorityQueue<SubIterator> pq = | ||
new PriorityQueue<SubIterator>(impacts.length) { | ||
@Override | ||
protected boolean lessThan(SubIterator a, SubIterator b) { | ||
if (a.current == null) { // means iteration is finished | ||
return false; | ||
} | ||
if (b.current == null) { | ||
return true; | ||
} | ||
return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; | ||
} | ||
}; | ||
for (List<Impact> toMergeImpacts : toMerge) { | ||
pq.add(new SubIterator(toMergeImpacts.iterator())); | ||
} | ||
|
||
List<Impact> mergedImpacts = new ArrayList<>(); | ||
|
||
// Idea: merge impacts by norm. The tricky thing is that we need to | ||
// consider norm values that are not in the impacts too. For | ||
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], | ||
// there might well be a document that has a freq of 2 and a length of 11, | ||
// which was just not added to the list of impacts because {freq=2,norm=10} | ||
// is more competitive. So the way it works is that we track the sum of | ||
// the term freqs that we have seen so far in order to account for these | ||
// implicit impacts. | ||
|
||
long sumTf = 0; | ||
SubIterator top = pq.top(); | ||
do { | ||
final long norm = top.current.norm; | ||
do { | ||
sumTf += top.current.freq - top.previousFreq; | ||
top.next(); | ||
top = pq.updateTop(); | ||
} while (top.current != null && top.current.norm == norm); | ||
|
||
final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); | ||
if (mergedImpacts.isEmpty()) { | ||
mergedImpacts.add(new Impact(freqUpperBound, norm)); | ||
} else { | ||
Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); | ||
assert Long.compareUnsigned(prevImpact.norm, norm) < 0; | ||
if (freqUpperBound > prevImpact.freq) { | ||
mergedImpacts.add(new Impact(freqUpperBound, norm)); | ||
} // otherwise the previous impact is already more competitive | ||
} | ||
} while (top.current != null); | ||
|
||
return mergedImpacts; | ||
} | ||
} |
88 changes: 88 additions & 0 deletions
88
lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import org.apache.lucene.index.Impact; | ||
import org.apache.lucene.index.Impacts; | ||
import org.apache.lucene.index.ImpactsEnum; | ||
import org.apache.lucene.index.ImpactsSource; | ||
|
||
public class SynonymImpactsSource implements ImpactsSource { | ||
|
||
private final ImpactsEnum[] impactsEnums; | ||
private final Impacts[] impacts; | ||
private final float[] boosts; | ||
private Impacts lead; | ||
|
||
public SynonymImpactsSource(ImpactsEnum[] impactsEnums, float[] boosts) { | ||
this.impactsEnums = impactsEnums; | ||
this.boosts = boosts; | ||
this.impacts = new Impacts[impactsEnums.length]; | ||
} | ||
|
||
@Override | ||
public Impacts getImpacts() throws IOException { | ||
// Use the impacts that have the lower next boundary as a lead. | ||
// It will decide on the number of levels and the block boundaries. | ||
if (lead == null) { | ||
Impacts tmpLead = null; | ||
for (int i = 0; i < impactsEnums.length; ++i) { | ||
impacts[i] = impactsEnums[i].getImpacts(); | ||
if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { | ||
tmpLead = impacts[i]; | ||
} | ||
} | ||
lead = tmpLead; | ||
} | ||
return new Impacts() { | ||
|
||
@Override | ||
public int numLevels() { | ||
// Delegate to the lead | ||
return lead.numLevels(); | ||
} | ||
|
||
@Override | ||
public int getDocIdUpTo(int level) { | ||
// Delegate to the lead | ||
return lead.getDocIdUpTo(level); | ||
} | ||
|
||
@Override | ||
public List<Impact> getImpacts(int level) { | ||
final int docIdUpTo = getDocIdUpTo(level); | ||
return ImpactsMergingUtils.mergeImpactsPerField( | ||
impactsEnums, impacts, boosts, docIdUpTo, false); | ||
} | ||
}; | ||
} | ||
|
||
@Override | ||
public void advanceShallow(int target) throws IOException { | ||
for (ImpactsEnum impactsEnum : impactsEnums) { | ||
if (impactsEnum.docID() < target) { | ||
impactsEnum.advanceShallow(target); | ||
} | ||
} | ||
} | ||
|
||
public Impacts[] impacts() { | ||
return impacts; | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MixedMutabilityReturnType: This method returns both mutable and immutable collections or maps from different paths. This may be confusing for users of the method. (details)
(at-me in a reply with "help" or "ignore")