diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java
new file mode 100644
index 00000000000..049fae8c6d9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.lucene.index.Impact;
+import org.apache.lucene.index.Impacts;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.SmallFloat;
+
+/**
+ * Utilities for merging impacts, used by {@link SynonymQuery}, CombinedFieldQuery, etc.
+ *
+ * @lucene.internal
+ */
+public final class ImpactsMergingUtils {
+
+  private ImpactsMergingUtils() {} // utility class, no instances
+
+  /** Cache of decoded norms. */
+  private static final float[] LENGTH_TABLE = new float[256];
+
+  static {
+    for (int i = 0; i < 256; i++) {
+      LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
+    }
+  }
+
+  /**
+   * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if
+   * there is no such level.
+   */
+  private static int getLevel(Impacts impacts, int docIdUpTo) {
+    for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
+      if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
+        return level;
+      }
+    }
+    return -1;
+  }
+
+  private static class SubIterator {
+    final Iterator<Impact> iterator;
+    int previousFreq;
+    Impact current;
+
+    SubIterator(Iterator<Impact> iterator) {
+      this.iterator = iterator;
+      this.current = iterator.next();
+    }
+
+    void next() {
+      previousFreq = current.freq;
+      if (iterator.hasNext() == false) {
+        current = null;
+      } else {
+        current = iterator.next();
+      }
+    }
+  }
+
+  private static double normToLength(long norm) {
+    return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
+  }
+
+  /**
+   * Merge impacts from multiple {@link ImpactsEnum}s (one per term match) within the same field.
+   * The high-level logic is to combine the freqs of impacts that share the same norm.
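+   *
+   * <p>For example (hypothetical values): impacts {freq=3, norm=10} and {freq=2, norm=10} from
+   * two terms of the same field combine into the single upper bound {freq=5, norm=10}.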
+   */
+  public static List<Impact> mergeImpactsPerField(
+      ImpactsEnum[] impactsEnum,
+      Impacts[] impacts,
+      float[] termBoosts,
+      int docIdUpTo,
+      boolean combineMultiNorms) {
+    assert impactsEnum.length == impacts.length;
+    assert impactsEnum.length == termBoosts.length;
+
+    List<List<Impact>> toMerge = new ArrayList<>();
+
+    for (int i = 0; i < impactsEnum.length; ++i) {
+      if (impactsEnum[i].docID() <= docIdUpTo) {
+        int impactsLevel = getLevel(impacts[i], docIdUpTo);
+        if (impactsLevel == -1) {
+          // One instance doesn't have impacts that cover up to docIdUpTo
+          // Return impacts that trigger the maximum score
+          return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
+        }
+        final List<Impact> impactList;
+        if (termBoosts[i] != 1f) {
+          float boost = termBoosts[i];
+          impactList =
+              impacts[i].getImpacts(impactsLevel).stream()
+                  .map(
+                      impact -> {
+                        int boostedFreq = (int) Math.ceil(impact.freq * boost);
+                        long boostedNorm =
+                            combineMultiNorms
+                                ? SmallFloat.intToByte4(
+                                    (int) Math.floor(normToLength(impact.norm) * boost))
+                                : impact.norm;
+                        return new Impact(boostedFreq, boostedNorm);
+                      })
+                  .collect(Collectors.toList());
+        } else {
+          impactList = impacts[i].getImpacts(impactsLevel);
+        }
+        toMerge.add(impactList);
+      }
+    }
+
+    // All impactsEnums for this field were positioned beyond docIdUpTo, which is possible when:
+    // 1. multiple fields are involved, and
+    // 2. docIdUpTo was taken as the minimum of the boundaries of all impactsEnums across fields
+    if (toMerge.size() == 0) {
+      return new ArrayList<>();
+    }
+
+    if (toMerge.size() == 1) {
+      // common if one synonym is common and the other one is rare
+      return toMerge.get(0);
+    }
+
+    PriorityQueue<SubIterator> pq =
+        new PriorityQueue<SubIterator>(impacts.length) {
+          @Override
+          protected boolean lessThan(SubIterator a, SubIterator b) {
+            if (a.current == null) { // means iteration is finished
+              return false;
+            }
+            if (b.current == null) {
+              return true;
+            }
+            return Long.compareUnsigned(a.current.norm, b.current.norm) < 0;
+          }
+        };
+    for (List<Impact> toMergeImpacts : toMerge) {
+      pq.add(new SubIterator(toMergeImpacts.iterator()));
+    }
+
+    List<Impact> mergedImpacts = new ArrayList<>();
+
+    // Idea: merge impacts by norm. The tricky thing is that we need to
+    // consider norm values that are not in the impacts too. For
+    // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
+    // there might well be a document that has a freq of 2 and a length of 11,
+    // which was just not added to the list of impacts because {freq=2,norm=10}
+    // is more competitive. So the way it works is that we track the sum of
+    // the term freqs that we have seen so far in order to account for these
+    // implicit impacts.
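+    // Worked example (hypothetical numbers): merging [{freq=2,norm=10}, {freq=4,norm=12}] with
+    // [{freq=3,norm=10}] proceeds norm by norm. At norm=10, sumTf = 2 + 3 = 5, which yields
+    // {freq=5,norm=10}. At norm=12, the first list may contribute up to 4 occurrences instead
+    // of 2, so sumTf += 4 - 2, which yields {freq=7,norm=12}.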
+
+    long sumTf = 0;
+    SubIterator top = pq.top();
+    do {
+      final long norm = top.current.norm;
+      do {
+        sumTf += top.current.freq - top.previousFreq;
+        top.next();
+        top = pq.updateTop();
+      } while (top.current != null && top.current.norm == norm);
+
+      final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
+      if (mergedImpacts.isEmpty()) {
+        mergedImpacts.add(new Impact(freqUpperBound, norm));
+      } else {
+        Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1);
+        assert Long.compareUnsigned(prevImpact.norm, norm) < 0;
+        if (freqUpperBound > prevImpact.freq) {
+          mergedImpacts.add(new Impact(freqUpperBound, norm));
+        } // otherwise the previous impact is already more competitive
+      }
+    } while (top.current != null);
+
+    return mergedImpacts;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java
new file mode 100644
index 00000000000..4366c012670
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.lucene.index.Impact;
+import org.apache.lucene.index.Impacts;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.ImpactsSource;
+
+/**
+ * {@link ImpactsSource} that merges the impacts of several synonym terms within a single field.
+ *
+ * @lucene.internal
+ */
+public class SynonymImpactsSource implements ImpactsSource {
+
+  private final ImpactsEnum[] impactsEnums;
+  private final Impacts[] impacts;
+  private final float[] boosts;
+  private Impacts lead;
+
+  public SynonymImpactsSource(ImpactsEnum[] impactsEnums, float[] boosts) {
+    this.impactsEnums = impactsEnums;
+    this.boosts = boosts;
+    this.impacts = new Impacts[impactsEnums.length];
+  }
+
+  @Override
+  public Impacts getImpacts() throws IOException {
+    // Use the impacts that have the lower next boundary as a lead.
+    // It will decide on the number of levels and the block boundaries.
+    if (lead == null) {
+      Impacts tmpLead = null;
+      for (int i = 0; i < impactsEnums.length; ++i) {
+        impacts[i] = impactsEnums[i].getImpacts();
+        if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) {
+          tmpLead = impacts[i];
+        }
+      }
+      lead = tmpLead;
+    }
+    return new Impacts() {
+
+      @Override
+      public int numLevels() {
+        // Delegate to the lead
+        return lead.numLevels();
+      }
+
+      @Override
+      public int getDocIdUpTo(int level) {
+        // Delegate to the lead
+        return lead.getDocIdUpTo(level);
+      }
+
+      @Override
+      public List<Impact> getImpacts(int level) {
+        final int docIdUpTo = getDocIdUpTo(level);
+        return ImpactsMergingUtils.mergeImpactsPerField(
+            impactsEnums, impacts, boosts, docIdUpTo, false);
+      }
+    };
+  }
+
+  @Override
+  public void advanceShallow(int target) throws IOException {
+    for (ImpactsEnum impactsEnum : impactsEnums) {
+      if (impactsEnum.docID() < target) {
+        impactsEnum.advanceShallow(target);
+      }
+    }
+    // Invalidate the cached lead: the per-enum impacts must be recomputed by the next
+    // getImpacts() call after the underlying enums have advanced.
+    lead = null;
+  }
+
+  /** Returns the per-term impacts computed by the last call to {@link #getImpacts()}. */
+  public Impacts[] impacts() {
+    return impacts;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
index 2aefe3f7860..f8e38a07d24 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
@@ -21,12 +21,9 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Objects;
 import java.util.stream.Collectors;
-import org.apache.lucene.index.Impact;
-import org.apache.lucene.index.Impacts;
 import org.apache.lucene.index.ImpactsEnum;
 import org.apache.lucene.index.ImpactsSource;
 import org.apache.lucene.index.IndexReader;
@@ -40,7 +37,6 @@
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.PriorityQueue;
 
 /**
  * A query that treats multiple terms as synonyms.
@@ -344,169 +340,7 @@
   /** Merge impacts for multiple synonyms. */
   static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) {
     assert impactsEnums.length == boosts.length;
-    return new ImpactsSource() {
-
-      class SubIterator {
-        final Iterator<Impact> iterator;
-        int previousFreq;
-        Impact current;
-
-        SubIterator(Iterator<Impact> iterator) {
-          this.iterator = iterator;
-          this.current = iterator.next();
-        }
-
-        void next() {
-          previousFreq = current.freq;
-          if (iterator.hasNext() == false) {
-            current = null;
-          } else {
-            current = iterator.next();
-          }
-        }
-      }
-
-      @Override
-      public Impacts getImpacts() throws IOException {
-        final Impacts[] impacts = new Impacts[impactsEnums.length];
-        // Use the impacts that have the lower next boundary as a lead.
-        // It will decide on the number of levels and the block boundaries.
-        Impacts tmpLead = null;
-        for (int i = 0; i < impactsEnums.length; ++i) {
-          impacts[i] = impactsEnums[i].getImpacts();
-          if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) {
-            tmpLead = impacts[i];
-          }
-        }
-        final Impacts lead = tmpLead;
-        return new Impacts() {
-
-          @Override
-          public int numLevels() {
-            // Delegate to the lead
-            return lead.numLevels();
-          }
-
-          @Override
-          public int getDocIdUpTo(int level) {
-            // Delegate to the lead
-            return lead.getDocIdUpTo(level);
-          }
-
-          /**
-           * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1}
-           * if there is no such level.
-           */
-          private int getLevel(Impacts impacts, int docIdUpTo) {
-            for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
-              if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
-                return level;
-              }
-            }
-            return -1;
-          }
-
-          @Override
-          public List<Impact> getImpacts(int level) {
-            final int docIdUpTo = getDocIdUpTo(level);
-
-            List<List<Impact>> toMerge = new ArrayList<>();
-
-            for (int i = 0; i < impactsEnums.length; ++i) {
-              if (impactsEnums[i].docID() <= docIdUpTo) {
-                int impactsLevel = getLevel(impacts[i], docIdUpTo);
-                if (impactsLevel == -1) {
-                  // One instance doesn't have impacts that cover up to docIdUpTo
-                  // Return impacts that trigger the maximum score
-                  return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
-                }
-                final List<Impact> impactList;
-                if (boosts[i] != 1f) {
-                  float boost = boosts[i];
-                  impactList =
-                      impacts[i].getImpacts(impactsLevel).stream()
-                          .map(
-                              impact ->
-                                  new Impact((int) Math.ceil(impact.freq * boost), impact.norm))
-                          .collect(Collectors.toList());
-                } else {
-                  impactList = impacts[i].getImpacts(impactsLevel);
-                }
-                toMerge.add(impactList);
-              }
-            }
-            assert toMerge.size()
-                > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong
-
-            if (toMerge.size() == 1) {
-              // common if one synonym is common and the other one is rare
-              return toMerge.get(0);
-            }
-
-            PriorityQueue<SubIterator> pq =
-                new PriorityQueue<SubIterator>(impacts.length) {
-                  @Override
-                  protected boolean lessThan(SubIterator a, SubIterator b) {
-                    if (a.current == null) { // means iteration is finished
-                      return false;
-                    }
-                    if (b.current == null) {
-                      return true;
-                    }
-                    return Long.compareUnsigned(a.current.norm, b.current.norm) < 0;
-                  }
-                };
-            for (List<Impact> impacts : toMerge) {
-              pq.add(new SubIterator(impacts.iterator()));
-            }
-
-            List<Impact> mergedImpacts = new ArrayList<>();
-
-            // Idea: merge impacts by norm. The tricky thing is that we need to
-            // consider norm values that are not in the impacts too. For
-            // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
-            // there might well be a document that has a freq of 2 and a length of 11,
-            // which was just not added to the list of impacts because {freq=2,norm=10}
-            // is more competitive. So the way it works is that we track the sum of
-            // the term freqs that we have seen so far in order to account for these
-            // implicit impacts.
-
-            long sumTf = 0;
-            SubIterator top = pq.top();
-            do {
-              final long norm = top.current.norm;
-              do {
-                sumTf += top.current.freq - top.previousFreq;
-                top.next();
-                top = pq.updateTop();
-              } while (top.current != null && top.current.norm == norm);
-
-              final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
-              if (mergedImpacts.isEmpty()) {
-                mergedImpacts.add(new Impact(freqUpperBound, norm));
-              } else {
-                Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1);
-                assert Long.compareUnsigned(prevImpact.norm, norm) < 0;
-                if (freqUpperBound > prevImpact.freq) {
-                  mergedImpacts.add(new Impact(freqUpperBound, norm));
-                } // otherwise the previous impact is already more competitive
-              }
-            } while (top.current != null);
-
-            return mergedImpacts;
-          }
-        };
-      }
-
-      @Override
-      public void advanceShallow(int target) throws IOException {
-        for (ImpactsEnum impactsEnum : impactsEnums) {
-          if (impactsEnum.docID() < target) {
-            impactsEnum.advanceShallow(target);
-          }
-        }
-      }
-    };
+    return new SynonymImpactsSource(impactsEnums, boosts);
   }
 
   private static class SynonymScorer extends Scorer {
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
index fccd6ce3eca..d8ed894f150 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
@@ -29,6 +29,10 @@
 import java.util.TreeMap;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Impact;
+import org.apache.lucene.index.Impacts;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.ImpactsSource;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
@@ -44,6 +48,8 @@
 import org.apache.lucene.search.DisjunctionDISIApproximation;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.ImpactsDISI;
+import org.apache.lucene.search.ImpactsMergingUtils;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.Matches;
@@ -51,6 +57,7 @@
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.SynonymImpactsSource;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermScorer;
 import org.apache.lucene.search.TermStatistics;
@@ -319,16 +326,19 @@ class CombinedFieldWeight extends Weight {
     private final IndexSearcher searcher;
     private final TermStates[] termStates;
     private final Similarity.SimScorer simWeight;
+    private final ScoreMode scoreMode;
 
     CombinedFieldWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost)
         throws IOException {
       super(query);
       assert scoreMode.needsScores();
+      this.scoreMode = scoreMode;
       this.searcher = searcher;
       long docFreq = 0;
      long totalTermFreq = 0;
       termStates = new TermStates[fieldTerms.length];
-      for (int i = 0; i < termStates.length; i++) {
+
+      for (int i = 0; i < fieldTerms.length; i++) {
         FieldAndWeight field = fieldAndWeights.get(fieldTerms[i].field());
         TermStates ts = TermStates.build(searcher.getTopReaderContext(), fieldTerms[i], true);
         termStates[i] = ts;
@@ -356,6 +366,7 @@ private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher)
       long docCount = 0;
       long sumTotalTermFreq = 0;
       long sumDocFreq = 0;
+
       for (FieldAndWeight fieldWeight : fieldAndWeights.values()) {
         CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field);
         if (collectionStats != null) {
@@ -402,14 +413,27 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOException
     public Scorer scorer(LeafReaderContext context) throws IOException {
       List<PostingsEnum> iterators = new ArrayList<>();
       List<FieldAndWeight> fields = new ArrayList<>();
+      Map<String, List<ImpactsEnum>> tempFieldImpactsEnums = new HashMap<>(fieldAndWeights.size());
 
       for (int i = 0; i < fieldTerms.length; i++) {
         TermState state = termStates[i].get(context);
         if (state != null) {
-          TermsEnum termsEnum = context.reader().terms(fieldTerms[i].field()).iterator();
+          String fieldName = fieldTerms[i].field();
+          fields.add(fieldAndWeights.get(fieldName));
+          tempFieldImpactsEnums.putIfAbsent(fieldName, new ArrayList<>());
+
+          TermsEnum termsEnum = context.reader().terms(fieldName).iterator();
           termsEnum.seekExact(fieldTerms[i].bytes(), state);
-          PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
-          iterators.add(postingsEnum);
-          fields.add(fieldAndWeights.get(fieldTerms[i].field()));
+
+          if (scoreMode == ScoreMode.TOP_SCORES) {
+            ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
+            iterators.add(impactsEnum);
+            tempFieldImpactsEnums.get(fieldName).add(impactsEnum);
+          } else {
+            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
+            iterators.add(postingsEnum);
+          }
         }
       }
 
@@ -421,18 +445,40 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
           new MultiNormsLeafSimScorer(simWeight, context.reader(), fieldAndWeights.values(), true);
       LeafSimScorer nonScoringSimScorer =
          new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false);
+
       // we use termscorers + disjunction as an impl detail
       DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
       for (int i = 0; i < iterators.size(); i++) {
-        float weight = fields.get(i).weight;
+        FieldAndWeight fieldAndWeight = fields.get(i);
         queue.add(
             new WeightedDisiWrapper(
-                new TermScorer(this, iterators.get(i), nonScoringSimScorer), weight));
+                new TermScorer(this, iterators.get(i), nonScoringSimScorer),
+                fieldAndWeight.weight));
       }
+
       // Even though it is called approximation, it is accurate since none of
       // the sub iterators are two-phase iterators.
       DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue);
-      return new CombinedFieldScorer(this, queue, iterator, scoringSimScorer);
+      ImpactsDISI impactsDisi = null;
+
+      if (scoreMode == ScoreMode.TOP_SCORES) {
+        Map<String, ImpactsEnum[]> fieldImpactsEnums = new HashMap<>();
+        Map<String, float[]> fieldWeights = new HashMap<>();
+
+        for (Map.Entry<String, List<ImpactsEnum>> e : tempFieldImpactsEnums.entrySet()) {
+          fieldImpactsEnums.put(e.getKey(), e.getValue().toArray(new ImpactsEnum[0]));
+
+          float[] weights = new float[e.getValue().size()];
+          Arrays.fill(weights, fieldAndWeights.get(e.getKey()).weight);
+          fieldWeights.put(e.getKey(), weights);
+        }
+
+        ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights);
+        iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight);
+      }
+
+      return new CombinedFieldScorer(this, queue, iterator, impactsDisi, scoringSimScorer);
     }
 
     @Override
@@ -441,6 +487,148 @@ public boolean isCacheable(LeafReaderContext ctx) {
     }
   }
 
+  /**
+   * Merge impacts for combined field.
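+   *
+   * <p>Impacts are first merged per field, in the same way as for synonyms (see {@link
+   * SynonymImpactsSource}), then combined across fields into a single conservative upper bound
+   * (see mergeImpactsAcrossFields below).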
+   */
+  static ImpactsSource mergeImpacts(
+      Map<String, ImpactsEnum[]> fieldImpactsEnum, Map<String, float[]> fieldWeights) {
+
+    return new ImpactsSource() {
+      Map<String, SynonymImpactsSource> fieldImpactsSource = null;
+
+      @Override
+      public Impacts getImpacts() throws IOException {
+        if (fieldImpactsSource == null) {
+          fieldImpactsSource = new HashMap<>();
+          for (Map.Entry<String, ImpactsEnum[]> e : fieldImpactsEnum.entrySet()) {
+            SynonymImpactsSource source =
+                new SynonymImpactsSource(e.getValue(), fieldWeights.get(e.getKey()));
+            fieldImpactsSource.put(e.getKey(), source);
+          }
+        }
+
+        // Materialize the per-field impacts once: the Impacts methods below are not allowed to
+        // throw IOException, so they must not call getImpacts() themselves.
+        final Map<String, Impacts> fieldImpacts = new HashMap<>(fieldImpactsSource.size());
+        for (Map.Entry<String, SynonymImpactsSource> e : fieldImpactsSource.entrySet()) {
+          fieldImpacts.put(e.getKey(), e.getValue().getImpacts());
+        }
+
+        return new Impacts() {
+          @Override
+          public int numLevels() {
+            // max of levels across fields' impactsEnums
+            int result = 0;
+            for (Impacts impacts : fieldImpacts.values()) {
+              result = Math.max(result, impacts.numLevels());
+            }
+            return result;
+          }
+
+          @Override
+          public int getDocIdUpTo(int level) {
+            // min of docIdUpTo across fields' impactsEnums
+            int result = Integer.MAX_VALUE;
+            for (Impacts impacts : fieldImpacts.values()) {
+              if (impacts.numLevels() > level) {
+                result = Math.min(result, impacts.getDocIdUpTo(level));
+              }
+            }
+            return result;
+          }
+
+          // We cannot simply loop over each field's getImpacts(level) and combine the results,
+          // because for the same level the docIdUpTo of each field's Impacts may differ
+          @Override
+          public List<Impact> getImpacts(int level) {
+            final int docIdUpTo = getDocIdUpTo(level);
+            final Map<String, List<Impact>> mergedImpactsPerField =
+                getMergedImpactsPerField(docIdUpTo);
+
+            return mergeImpactsAcrossFields(mergedImpactsPerField);
+          }
+
+          private Map<String, List<Impact>> getMergedImpactsPerField(int docIdUpTo) {
+            final Map<String, List<Impact>> result = new HashMap<>(fieldImpactsEnum.size());
+
+            for (Map.Entry<String, ImpactsEnum[]> e : fieldImpactsEnum.entrySet()) {
+              String field = e.getKey();
+              ImpactsEnum[] impactsEnums = e.getValue();
+              List<Impact> mergedImpacts =
+                  ImpactsMergingUtils.mergeImpactsPerField(
+                      impactsEnums,
+                      fieldImpactsSource.get(field).impacts(),
+                      fieldWeights.get(field),
+                      docIdUpTo,
+                      true);
+
+              if (mergedImpacts.size() == 0) {
+                // all impactsEnums for this field were positioned beyond docIdUpTo, continue to
+                // the next field
+                continue;
+              } else if (mergedImpacts.size() == 1
+                  && mergedImpacts.get(0).freq == Integer.MAX_VALUE
+                  && mergedImpacts.get(0).norm == 1L) {
+                // one field gets impacts that trigger maximum score, pass it up
+                return Collections.singletonMap(field, mergedImpacts);
+              } else {
+                result.put(field, mergedImpacts);
+              }
+            }
+
+            return result;
+          }
+
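+          // Example (hypothetical values): if field "a" merged to [{freq=5,norm=8}] and field
+          // "b" to [{freq=3,norm=6}, {freq=7,norm=9}], the combined upper bound below is
+          // {freq=5+7=12, norm=min(8,6)=6}: the largest freq any document could accumulate,
+          // paired with the smallest (most competitive) norm.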
+          private List<Impact> mergeImpactsAcrossFields(
+              Map<String, List<Impact>> mergedImpactsPerField) {
+            if (mergedImpactsPerField.size() == 1) {
+              return mergedImpactsPerField.values().iterator().next();
+            }
+
+            // Upper-bound by creating a single impact that is at least as competitive as any
+            // combination of the per-field impacts: this avoids the potentially costly
+            // combinatorial explosion of computing exact merged impacts across fields
+            long maxFreqSum = 0;
+            long minNorm = Long.MAX_VALUE;
+            for (List<Impact> impacts : mergedImpactsPerField.values()) {
+              // highest freq at the end of each impact list
+              maxFreqSum += impacts.get(impacts.size() - 1).freq;
+              // lowest norm at the start of each impact list
+              minNorm = Math.min(minNorm, impacts.get(0).norm);
+            }
+
+            return Collections.singletonList(
+                new Impact((int) Math.min(maxFreqSum, Integer.MAX_VALUE), minNorm));
+          }
+        };
+      }
+
+      @Override
+      public void advanceShallow(int target) throws IOException {
+        for (ImpactsEnum[] impactsEnums : fieldImpactsEnum.values()) {
+          for (ImpactsEnum impactsEnum : impactsEnums) {
+            if (impactsEnum.docID() < target) {
+              impactsEnum.advanceShallow(target);
+            }
+          }
+        }
+
+        // Force the per-field sources (and their cached impacts) to be rebuilt on the next
+        // getImpacts() call.
+        fieldImpactsSource = null;
+      }
+    };
+  }
+
   private static class WeightedDisiWrapper extends DisiWrapper {
     final float weight;
 
@@ -458,15 +646,18 @@ private static class CombinedFieldScorer extends Scorer {
     private final DisiPriorityQueue queue;
     private final DocIdSetIterator iterator;
     private final MultiNormsLeafSimScorer simScorer;
+    private final ImpactsDISI impactsDISI;
 
     CombinedFieldScorer(
         Weight weight,
         DisiPriorityQueue queue,
         DocIdSetIterator iterator,
+        ImpactsDISI impactsDISI,
         MultiNormsLeafSimScorer simScorer) {
       super(weight);
       this.queue = queue;
       this.iterator = iterator;
+      this.impactsDISI = impactsDISI;
       this.simScorer = simScorer;
     }
 
@@ -499,7 +690,29 @@ public DocIdSetIterator iterator() {
 
     @Override
     public float getMaxScore(int upTo) throws IOException {
-      return Float.POSITIVE_INFINITY;
+      if (impactsDISI != null) {
+        return impactsDISI.getMaxScore(upTo);
+      } else {
+        return Float.POSITIVE_INFINITY;
+      }
+    }
+
+    @Override
+    public int advanceShallow(int target) throws IOException {
+      if (impactsDISI != null) {
+        return impactsDISI.advanceShallow(target);
+      } else {
+        return super.advanceShallow(target);
+      }
+    }
+
+    @Override
+    public void setMinCompetitiveScore(float minScore) throws IOException {
+      if (impactsDISI != null) {
+        impactsDISI.setMinCompetitiveScore(minScore);
+      } else {
+        super.setMinCompetitiveScore(minScore);
+      }
     }
   }
 }
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
index 4a18b2c1253..7026edb1c81 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
@@ -169,6 +169,118 @@ public void testSameScore() throws IOException {
     dir.close();
   }
 
+  public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException {
+    int numDocs =
+        randomBoolean()
+            ? atLeast(1000)
+            : atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data
+    int numMatchDoc = randomIntBetween(200, 500);
+    int numHits = atMost(100);
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    // adding non-matching docs
+    for (int i = 0; i < numDocs - numMatchDoc; ++i) {
+      Document doc = new Document();
+
+      int freqA = random().nextInt(50) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "bar" + j, Store.NO));
+      }
+
+      int freqB = random().nextInt(50) + 1;
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "bla" + j, Store.NO));
+      }
+      int freqC = random().nextInt(50) + 1;
+      for (int j = 0; j < freqC; j++) {
+        doc.add(new TextField("c", "bla" + j, Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    // adding potentially matching docs
+    for (int i = 0; i < numMatchDoc; i++) {
+      Document doc = new Document();
+
+      int freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "foo", Store.NO));
+        }
+      }
+
+      freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "foo" + j, Store.NO));
+        }
+      }
+
+      freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "zoo", Store.NO));
+        }
+      }
+
+      int freqB = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqB; j++) {
+          doc.add(new TextField("b", "zoo", Store.NO));
+        }
+      }
+
+      freqB = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqB; j++) {
+          doc.add(new TextField("b", "zoo" + j, Store.NO));
+        }
+      }
+
+      int freqC = random().nextInt(20) + 1;
+      for (int j = 0; j < freqC; j++) {
+        doc.add(new TextField("c", "bla" + j, Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(similarity);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .addTerm(new BytesRef("foo"))
+            .addTerm(new BytesRef("zoo"))
+            .build();
+
+    TopScoreDocCollector topScoresCollector =
+        TopScoreDocCollector.create(numHits, null, numHits); // TOP_SCORES
+    searcher.search(query, topScoresCollector);
+
+    TopScoreDocCollector completeCollector =
+        TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE); // COMPLETE
+    searcher.search(query, completeCollector);
+
+    CheckHits.checkEqual(
+        query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   public void testScoringWithMultipleFieldTermsMatch() throws IOException {
     int numMatchDoc = randomIntBetween(100, 500);
     int numHits = atMost(100);