From 3c60209d88a74614e4ce294e07ca3483a92cc3da Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 28 Oct 2021 22:06:04 -0700 Subject: [PATCH 01/16] LUCENE-10061: Implements basic dynamic pruning support for CombinedFieldsQuery // HighMed: from office # freq=3224339 freq=225338 // top scores time usage 425 milliseconds // complete time usage 401 milliseconds // HighHigh: but publisher # freq=1456553 freq=1289029 // top scores time usage 469 milliseconds // complete time usage 322 milliseconds // HighLow: with fung # freq=3709421 freq=1344 // top scores time usage 241 milliseconds // complete time usage 428 milliseconds // HighLow: date insult # freq=2020626 freq=4424 // top scores time usage 171 milliseconds // complete time usage 225 milliseconds --- .../randomization/policies/tests.policy | 5 + .../sandbox/search/CombinedFieldQuery.java | 407 +++++++++++++++++- .../search/TestCombinedFieldQuery.java | 81 ++++ 3 files changed, 483 insertions(+), 10 deletions(-) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index 71dc7991fdb..178fc6bbfa5 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -121,6 +121,11 @@ grant codeBase "file:${gradle.lib.dir}${/}-" { permission java.security.AllPermission; }; +// nocommit - hack to run test on indices stored elsewhere +grant { + permission java.security.AllPermission; +}; + grant codeBase "file:${gradle.worker.jar}" { permission java.security.AllPermission; }; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d3187a0896e..eda88ba5472 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -20,15 +20,24 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.TreeMap; +import java.util.stream.Collectors; + import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; @@ -44,6 +53,7 @@ import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.ImpactsDISI; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; @@ -61,6 +71,7 @@ import org.apache.lucene.search.similarities.SimilarityBase; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SmallFloat; @@ -94,8 +105,17 @@ * @lucene.experimental */ public final class CombinedFieldQuery extends Query implements Accountable { + /** Cache of decoded 
norms. */ + private static final float[] LENGTH_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); + } + } + private static final long BASE_RAM_BYTES = - RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); + RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); /** A builder for {@link CombinedFieldQuery}. */ public static class Builder { @@ -319,16 +339,19 @@ class CombinedFieldWeight extends Weight { private final IndexSearcher searcher; private final TermStates[] termStates; private final Similarity.SimScorer simWeight; + private final ScoreMode scoreMode; CombinedFieldWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { super(query); assert scoreMode.needsScores(); + this.scoreMode = scoreMode; this.searcher = searcher; long docFreq = 0; long totalTermFreq = 0; termStates = new TermStates[fieldTerms.length]; - for (int i = 0; i < termStates.length; i++) { + + for (int i = 0; i < fieldTerms.length; i++) { FieldAndWeight field = fieldAndWeights.get(fieldTerms[i].field()); TermStates ts = TermStates.build(searcher.getTopReaderContext(), fieldTerms[i], true); termStates[i] = ts; @@ -356,6 +379,7 @@ private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher) long docCount = 0; long sumTotalTermFreq = 0; long sumDocFreq = 0; + for (FieldAndWeight fieldWeight : fieldAndWeights.values()) { CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field); if (collectionStats != null) { @@ -402,14 +426,25 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); + Map> fieldImpacts = new HashMap<>(); + for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { - TermsEnum termsEnum = context.reader().terms(fieldTerms[i].field()).iterator(); + String fieldName = fieldTerms[i].field(); + fields.add(fieldAndWeights.get(fieldName)); + fieldImpacts.putIfAbsent(fieldName, new ArrayList<>()); + + TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); - PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); - iterators.add(postingsEnum); - fields.add(fieldAndWeights.get(fieldTerms[i].field())); + if (scoreMode == ScoreMode.TOP_SCORES) { + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); + iterators.add(impactsEnum); + fieldImpacts.get(fieldName).add(impactsEnum); + } else { + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); + iterators.add(postingsEnum); + } } } @@ -421,18 +456,33 @@ public Scorer scorer(LeafReaderContext context) throws IOException { new MultiNormsLeafSimScorer(simWeight, context.reader(), fields, true); LeafSimScorer nonScoringSimScorer = new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false); + // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); + Map fieldWeights = new HashMap<>(); for (int i = 0; i < iterators.size(); i++) { - float weight = fields.get(i).weight; + FieldAndWeight fieldAndWeight = fields.get(i); + if (fieldWeights.containsKey(fieldAndWeight.field)) { + assert fieldWeights.get(fieldAndWeight.field).equals(fieldAndWeight.weight); + } else { + 
fieldWeights.put(fieldAndWeight.field, fieldAndWeight.weight); + } queue.add( new WeightedDisiWrapper( - new TermScorer(this, iterators.get(i), nonScoringSimScorer), weight)); + new TermScorer(this, iterators.get(i), nonScoringSimScorer), fieldAndWeight.weight)); } + // Even though it is called approximation, it is accurate since none of // the sub iterators are two-phase iterators. DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue); - return new CombinedFieldScorer(this, queue, iterator, scoringSimScorer); + ImpactsDISI impactsDisi = null; + + if (scoreMode == ScoreMode.TOP_SCORES) { + ImpactsSource impactsSource = mergeImpacts(fieldImpacts, fieldWeights); + iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); + } + + return new CombinedFieldScorer(this, queue, iterator, impactsDisi, scoringSimScorer); } @Override @@ -441,6 +491,318 @@ public boolean isCacheable(LeafReaderContext ctx) { } } + /** Merge impacts for combined field. */ + static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map fieldWeights) { + return new ImpactsSource() { + + class SubIterator { + final Iterator iterator; + int previousFreq; + Impact current; + + SubIterator(Iterator iterator) { + this.iterator = iterator; + this.current = iterator.next(); + } + + void next() { + previousFreq = current.freq; + if (iterator.hasNext() == false) { + current = null; + } else { + current = iterator.next(); + } + } + } + + Map> fieldsWithImpacts; + + @Override + public Impacts getImpacts() throws IOException { + fieldsWithImpacts = new HashMap<>(); + + // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field + // They collectively will decide on the number of levels and the block boundaries. + Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.keySet().size()); + + for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { + String field = fieldImpacts.getKey(); + List impactsEnums = fieldImpacts.getValue(); + fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + if (tmpLead == null || + impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } + } + + leadingImpactsPerField.put(field, tmpLead); + } + + return new Impacts() { + + @Override + public int numLevels() { + // max of levels across fields' impactEnums + return leadingImpactsPerField.values().stream().map(Impacts::numLevels).max(Integer::compareTo).get(); + } + + @Override + public int getDocIdUpTo(int level) { + // min of docIdUpTo across fields' impactEnums + return leadingImpactsPerField.values().stream().filter(i -> i.numLevels() > level).map(i -> i.getDocIdUpTo(level)).min(Integer::compareTo).get(); + } + + @Override + public List getImpacts(int level) { + final int docIdUpTo = getDocIdUpTo(level); + final Map> mergedImpactsPerField = mergeImpactsPerField(docIdUpTo); + + return mergeImpactsAcrossFields(mergedImpactsPerField); + } + + private Map> mergeImpactsPerField(int docIdUpTo) { + final Map> result = new HashMap<>(); + + for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { + String field = impactsPerField.getKey(); + List impactsEnums = impactsPerField.getValue(); + List impacts = fieldsWithImpacts.get(field); + + List mergedImpacts = 
doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + + if (mergedImpacts.size() == 0) { + // all impactEnums for this field were positioned beyond docIdUpTo, continue to next field + continue; + } else if (mergedImpacts.size() == 1 && mergedImpacts.get(0).freq == Integer.MAX_VALUE && mergedImpacts.get(0).norm == 1L) { + // one field gets impacts that trigger maximum score, pass it up + return Collections.singletonMap(field, mergedImpacts); + } else { + result.put(field, mergedImpacts); + } + } + + return result; + } + + + // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery + private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { + List> toMerge = new ArrayList<>(); + + for (int i = 0; i < impactsEnums.size(); ++i) { + if (impactsEnums.get(i).docID() <= docIdUpTo) { + int impactsLevel = getLevel(impacts.get(i), docIdUpTo); + if (impactsLevel == -1) { + // one instance doesn't have impacts that cover up to docIdUpTo + // return impacts that trigger the maximum score + return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + } + final List impactList; + float weight = fieldWeights.get(field); + if (weight != 1f) { + impactList = + impacts.get(i).getImpacts(impactsLevel).stream() + .map( + impact -> + new Impact((int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))) + .collect(Collectors.toList()); + } else { + impactList = impacts.get(i).getImpacts(impactsLevel); + } + toMerge.add(impactList); + } + } + + // all impactEnums for this field were positioned beyond docIdUpTo + if (toMerge.size() == 0) { + return new ArrayList<>(); + } + + if (toMerge.size() == 1) { + // common if one term is common and the other one is rare + return toMerge.get(0); + } + + PriorityQueue pq = + new PriorityQueue(impacts.size()) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; + + for (List toMergeImpacts : toMerge) { + pq.add(new SubIterator(toMergeImpacts.iterator())); + } + + List mergedImpacts = new LinkedList<>(); + + // Idea: merge impacts by norm. The tricky thing is that we need to + // consider norm values that are not in the impacts too. For + // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], + // there might well be a document that has a freq of 2 and a length of 11, + // which was just not added to the list of impacts because {freq=2,norm=10} + // is more competitive. So the way it works is that we track the sum of + // the term freqs that we have seen so far in order to account for these + // implicit impacts. 
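As an aside, the per-field merge described in the comment above can be sketched on its own with hypothetical freq/norm pairs. A local record stands in for org.apache.lucene.index.Impact, and the sketch rescans the lists at each norm boundary for clarity, whereas the real code streams impacts from a priority queue and compares norms unsigned:

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

class MergeByNormSketch {
  /** Stand-in for org.apache.lucene.index.Impact: freq upper bound for docs whose norm is at most norm. */
  record Imp(int freq, long norm) {}

  // For each norm boundary, the merged freq upper bound is the sum, over the field's
  // term impact lists, of the highest freq whose norm is <= that boundary, so that
  // "implicit" impacts below the boundary are covered by the running maximum.
  static List<Imp> mergePerField(List<List<Imp>> perTermImpacts) {
    TreeSet<Long> norms = new TreeSet<>();
    for (List<Imp> impacts : perTermImpacts) {
      for (Imp impact : impacts) {
        norms.add(impact.norm());
      }
    }
    List<Imp> merged = new ArrayList<>();
    for (long norm : norms) {
      long sumTf = 0;
      for (List<Imp> impacts : perTermImpacts) {
        int best = 0;
        for (Imp impact : impacts) {
          if (impact.norm() <= norm) { // signed comparison for brevity
            best = Math.max(best, impact.freq());
          }
        }
        sumTf += best;
      }
      int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
      if (merged.isEmpty() || freqUpperBound > merged.get(merged.size() - 1).freq()) {
        merged.add(new Imp(freqUpperBound, norm));
      }
    }
    return merged;
  }

  public static void main(String[] args) {
    List<Imp> term1 = List.of(new Imp(2, 10), new Imp(4, 12));
    List<Imp> term2 = List.of(new Imp(1, 11));
    // prints [Imp[freq=2, norm=10], Imp[freq=3, norm=11], Imp[freq=5, norm=12]]
    System.out.println(mergePerField(List.of(term1, term2)));
  }
}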
+ + long sumTf = 0; + SubIterator top = pq.top(); + do { + final long norm = top.current.norm; + do { + sumTf += top.current.freq - top.previousFreq; + top.next(); + top = pq.updateTop(); + } while (top.current != null && top.current.norm == norm); + + final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); + if (mergedImpacts.isEmpty()) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } else { + Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); + assert Long.compareUnsigned(prevImpact.norm, norm) < 0; + if (freqUpperBound > prevImpact.freq) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } // otherwise the previous impact is already more competitive + } + } while (top.current != null); + + return mergedImpacts; + } + + private int getLevel(Impacts impacts, int docIdUpTo) { + for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { + if (impacts.getDocIdUpTo(level) >= docIdUpTo) { + return level; + } + } + return -1; + } + + private List mergeImpactsAcrossFields(Map> mergedImpactsPerField) { + if (mergedImpactsPerField.size() == 1) { + return mergedImpactsPerField.values().iterator().next(); + } + + List> impactLists = new ArrayList<>(mergedImpactsPerField.size()); + for (List impacts : mergedImpactsPerField.values()) { + // add empty impact for cartesian product calculation + List newList = new ArrayList<>(impacts.size() + 1); + newList.addAll(impacts); + newList.add(new Impact(0, 0)); + impactLists.add(newList); + } + + List result = getCartesianProductOfImpacts(impactLists, 0); + + Collections.sort(result, new Comparator() { + @Override + public int compare(Impact o1, Impact o2) { + return (int) (o1.norm - o2.norm); + } + }); + + // remove the dummy Impact {0, 0} added above + result.remove(0); + + return result; + } + + // merge impacts across fields by doing cartesian products recursively, and only keep competitive ones. + // this way of merging impacts across fields mirror the scoring logic in MultiNormsLeafSimScorer#score, where norms were summed along with freqs + // this operation would be expensive for many fields with long impact lists + // + // e.g. + // 1. field a impacts: + // 2. field b impacts: + // 3. cartesian products of above: + // + // + // + // + // 4. keep only the competitive ones from #3 + // 5. 
after step #4, treat the result as competitive impacts for merged fields a & b, and merge it with the rest of fields' impacts + + private List getCartesianProductOfImpacts(List> values, int index) { + if (index == values.size() - 1) { + return values.get(values.size() - 1); + } + + List firstImpactList = values.get(index); + + List cartesianProductOfRestOfImpacts = getCartesianProductOfImpacts(values, index + 1); + + Map maxFreq = new HashMap<>(); + + for (int i = 0; i < firstImpactList.size(); i++) { + for (int j = 0; j < cartesianProductOfRestOfImpacts.size(); j++) { + Impact impactA = firstImpactList.get(i); + Impact impactB = cartesianProductOfRestOfImpacts.get(j); + + int tempNorm = (int) Math.floor(normToLength(impactA.norm) + normToLength(impactB.norm)); + long normSum = SmallFloat.intToByte4(tempNorm); + + long freqSum = (long) impactA.freq + (long) impactB.freq; + int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, freqSum); + + if (maxFreq.containsKey(normSum) == false || + (maxFreq.containsKey(normSum) && freqUpperBound > maxFreq.get(normSum))) { + maxFreq.put(normSum, freqUpperBound); + } + } + } + + // remove duplicate entries where freq is same, but norm larger +// Map reversedMaxFreq = new HashMap<>(); +// for (Map.Entry normFreqPair : maxFreq.entrySet()) { +// Long norm = normFreqPair.getKey(); +// Integer freq = normFreqPair.getValue(); +// +// if (reversedMaxFreq.containsKey(freq) == false || +// (reversedMaxFreq.containsKey(freq) && norm < reversedMaxFreq.get(freq))) { +// reversedMaxFreq.put(freq, norm); +// } +// } + + return maxFreq.entrySet().stream().map(e -> new Impact(e.getValue(), e.getKey())).collect(Collectors.toList()); + } + + }; + } + + private float normToLength(long norm) { + return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; + } + + @Override + public void advanceShallow(int target) throws IOException { + for (List impactsEnums: fieldsWithImpactsEnums.values()) { + for (ImpactsEnum impactsEnum : impactsEnums) { + if (impactsEnum.docID() < target) { + impactsEnum.advanceShallow(target); + } + } + } + } + }; + } + + private static class WeightedDisiWrapper extends DisiWrapper { final float weight; @@ -458,15 +820,18 @@ private static class CombinedFieldScorer extends Scorer { private final DisiPriorityQueue queue; private final DocIdSetIterator iterator; private final MultiNormsLeafSimScorer simScorer; + private final ImpactsDISI impactsDISI; CombinedFieldScorer( Weight weight, DisiPriorityQueue queue, DocIdSetIterator iterator, + ImpactsDISI impactsDISI, MultiNormsLeafSimScorer simScorer) { super(weight); this.queue = queue; this.iterator = iterator; + this.impactsDISI = impactsDISI; this.simScorer = simScorer; } @@ -499,7 +864,29 @@ public DocIdSetIterator iterator() { @Override public float getMaxScore(int upTo) throws IOException { - return Float.POSITIVE_INFINITY; + if (impactsDISI != null) { + return impactsDISI.getMaxScore(upTo); + } else { + return Float.POSITIVE_INFINITY; + } + } + + @Override + public int advanceShallow(int target) throws IOException { + if (impactsDISI != null) { + return impactsDISI.advanceShallow(target); + } else { + return super.advanceShallow(target); + } + } + + @Override + public void setMinCompetitiveScore(float minScore) throws IOException { + if (impactsDISI != null) { + impactsDISI.setMinCompetitiveScore(minScore); + } else { + super.setMinCompetitiveScore(minScore); + } } } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 7798a2a7017..4d5d3ac0b19 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -17,12 +17,18 @@ package org.apache.lucene.sandbox.search; import com.carrotsearch.randomizedtesting.generators.RandomPicks; + import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Arrays; +import java.util.concurrent.TimeUnit; + import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -46,6 +52,7 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -165,6 +172,80 @@ public void testSameScore() throws IOException { dir.close(); } + public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { + Path path = Paths.get("/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + + Directory index = FSDirectory.open(path); + IndexReader reader = DirectoryReader.open(index); + IndexSearcher searcher = new IndexSearcher(reader); + + // HighHigh: but publisher # freq=1456553 freq=1289029 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("but")) +// .addTerm(new BytesRef("publisher")) +// .build(); + + // HighMed: from office # freq=3224339 freq=225338 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("from")) +// .addTerm(new BytesRef("office")) +// .build(); + + // HighLow: with fung # freq=3709421 freq=1344 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("with")) +// .addTerm(new BytesRef("fung")) +// .build(); + + // HighLow: date insult # freq=2020626 freq=4424 + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("title", (float) 4.0) + .addField("body", (float) 2.0) + .addTerm(new BytesRef("date")) + .addTerm(new BytesRef("insult")) + .build(); + + long start; + long end; + long duration; + int runCount = 20; + + TopScoreDocCollector topScoresCollector = null; + start = System.nanoTime(); + for (int i = 0 ; i < runCount ; i++) { + topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES + searcher.search(query, topScoresCollector); + } + end = System.nanoTime(); + duration = (end - start) / runCount; + System.out.println("// top scores time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + + + TopScoreDocCollector completeCollector = 
null; + start = System.nanoTime(); + for (int i = 0 ; i < runCount ; i++) { + completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE + searcher.search(query, completeCollector); + } + end = System.nanoTime(); + duration = (end - start) / runCount; + System.out.println("// complete time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + + CheckHits.checkEqual(query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); + + reader.close(); + } + public void testNormsDisabled() throws IOException { Directory dir = newDirectory(); Similarity similarity = randomCompatibleSimilarity(); From 2ba435e5c83f870be95662c951c9818111843a59 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 3 Nov 2021 23:28:25 -0700 Subject: [PATCH 02/16] replace cartesian product calculation with simple upper-bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Perf tests result Run 1:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                      CFQHighHigh        3.53      (3.3%)        2.92      (5.3%)  -17.2% ( -25% -   -8%) 0.000                         PKLookup      108.13      (7.7%)      119.85      (8.2%)   10.8% (  -4% -   28%) 0.000                       CFQHighLow       14.88      (3.9%)       16.69     (12.5%)   12.2% (  -3% -   29%) 0.000                       CFQHighMed       21.11      (4.1%)       25.87     (11.8%)   22.6% (   6% -   40%) 0.000 Run 2:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                      CFQHighHigh        6.64      (3.1%)        5.63     (10.2%)  -15.2% ( -27% -   -1%) 0.000                       CFQHighLow        8.35      (2.8%)        8.05     (15.0%)   -3.6% ( -20% -   14%) 0.297                       CFQHighMed       24.51      (5.3%)       24.90     (19.9%)    1.6% ( -22% -   28%) 0.733                         PKLookup      110.06     (10.0%)      128.54      (7.9%)   16.8% (  -1% -   38%) 0.000 Run 3:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                       CFQHighMed       13.01      (2.9%)        9.82      (7.8%)  -24.5% ( -34% -  -14%) 0.000                         PKLookup      107.85      (8.1%)      111.79     (11.2%)    3.7% ( -14% -   24%) 0.236                      CFQHighHigh        4.83      (2.6%)        5.06      (8.6%)    4.7% (  -6% -   16%) 0.018                       CFQHighLow       14.95      (3.0%)       18.31     (19.0%)   22.5% (   0% -   45%) 0.000 Run 4:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                       CFQHighMed       11.11      (2.9%)        6.69      (4.1%)  -39.7% ( -45% -  -33%) 0.000                       CFQHighLow       27.55      (3.8%)       25.46     (11.0%)   -7.6% ( -21% -    7%) 0.003                      CFQHighHigh        5.25      (3.2%)        4.96      (6.1%)   -5.7% ( -14% -    3%) 0.000                         PKLookup      107.61      (6.7%)      121.19      (4.6%)   12.6% (   1% -   25%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 88 ++----------------- 1 file changed, 9 insertions(+), 79 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index eda88ba5472..144f060329a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -699,88 +698,20 @@ private List mergeImpactsAcrossFields(Map> mergedIm return mergedImpactsPerField.values().iterator().next(); } - List> impactLists = new ArrayList<>(mergedImpactsPerField.size()); + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential combinatorial explosion from accurate computation on merged impacts across fields + int maxFreq = 0; + long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { - // add empty impact for cartesian product calculation - List newList = new ArrayList<>(impacts.size() + 1); - newList.addAll(impacts); - newList.add(new Impact(0, 0)); - impactLists.add(newList); + // highest freq at the end of each impact list + maxFreq = Math.max(maxFreq, impacts.get(impacts.size() - 1).freq); + // lowest norm at the start of each impact list + minNorm = Math.min(minNorm, impacts.get(0).norm); } - List result = getCartesianProductOfImpacts(impactLists, 0); - - Collections.sort(result, new Comparator() { - @Override - public int compare(Impact o1, Impact o2) { - return (int) (o1.norm - o2.norm); - } - }); - - // remove the dummy Impact {0, 0} added above - result.remove(0); - - return result; + return Collections.singletonList(new Impact(maxFreq * mergedImpactsPerField.size(), minNorm)); } - // merge impacts across fields by doing cartesian products recursively, and only keep competitive ones. - // this way of merging impacts across fields mirror the scoring logic in MultiNormsLeafSimScorer#score, where norms were summed along with freqs - // this operation would be expensive for many fields with long impact lists - // - // e.g. - // 1. field a impacts: - // 2. field b impacts: - // 3. cartesian products of above: - // - // - // - // - // 4. keep only the competitive ones from #3 - // 5. 
after step #4, treat the result as competitive impacts for merged fields a & b, and merge it with the rest of fields' impacts - - private List getCartesianProductOfImpacts(List> values, int index) { - if (index == values.size() - 1) { - return values.get(values.size() - 1); - } - - List firstImpactList = values.get(index); - - List cartesianProductOfRestOfImpacts = getCartesianProductOfImpacts(values, index + 1); - - Map maxFreq = new HashMap<>(); - - for (int i = 0; i < firstImpactList.size(); i++) { - for (int j = 0; j < cartesianProductOfRestOfImpacts.size(); j++) { - Impact impactA = firstImpactList.get(i); - Impact impactB = cartesianProductOfRestOfImpacts.get(j); - - int tempNorm = (int) Math.floor(normToLength(impactA.norm) + normToLength(impactB.norm)); - long normSum = SmallFloat.intToByte4(tempNorm); - - long freqSum = (long) impactA.freq + (long) impactB.freq; - int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, freqSum); - - if (maxFreq.containsKey(normSum) == false || - (maxFreq.containsKey(normSum) && freqUpperBound > maxFreq.get(normSum))) { - maxFreq.put(normSum, freqUpperBound); - } - } - } - - // remove duplicate entries where freq is same, but norm larger -// Map reversedMaxFreq = new HashMap<>(); -// for (Map.Entry normFreqPair : maxFreq.entrySet()) { -// Long norm = normFreqPair.getKey(); -// Integer freq = normFreqPair.getValue(); -// -// if (reversedMaxFreq.containsKey(freq) == false || -// (reversedMaxFreq.containsKey(freq) && norm < reversedMaxFreq.get(freq))) { -// reversedMaxFreq.put(freq, norm); -// } -// } - - return maxFreq.entrySet().stream().map(e -> new Impact(e.getValue(), e.getKey())).collect(Collectors.toList()); - } }; } @@ -802,7 +733,6 @@ public void advanceShallow(int target) throws IOException { }; } - private static class WeightedDisiWrapper extends DisiWrapper { final float weight; From 1a71469bf7afa39a052dcb7cd8965ac6b5a96804 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 4 Nov 2021 21:12:31 -0700 Subject: [PATCH 03/16] Bug fix and optimize per CPU profiler Run 1 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 4.64 (6.5%) 3.30 (4.7%) -29.0% ( -37% - -19%) 0.000 CFQHighHigh 11.09 (6.0%) 9.61 (6.0%) -13.3% ( -23% - -1%) 0.000 PKLookup 103.38 (4.4%) 108.04 (4.3%) 4.5% ( -4% - 13%) 0.001 CFQHighMedLow 10.58 (6.1%) 12.30 (8.7%) 16.2% ( 1% - 33%) 0.000 CFQHighMed 10.70 (7.4%) 15.51 (11.2%) 44.9% ( 24% - 68%) 0.000 CFQHighLowLow 8.18 (8.2%) 12.87 (11.6%) 57.3% ( 34% - 84%) 0.000 CFQHighLow 14.57 (7.5%) 30.81 (15.1%) 111.4% ( 82% - 144%) 0.000 Run 2 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 5.33 (5.7%) 4.02 (7.7%) -24.4% ( -35% - -11%) 0.000 CFQHighLowLow 17.14 (6.2%) 13.06 (5.4%) -23.8% ( -33% - -13%) 0.000 CFQHighMed 17.37 (5.8%) 14.38 (7.7%) -17.2% ( -29% - -3%) 0.000 PKLookup 103.57 (5.5%) 108.84 (5.9%) 5.1% ( -6% - 17%) 0.005 CFQHighMedLow 11.25 (7.2%) 12.70 (9.0%) 12.9% ( -3% - 31%) 0.000 CFQHighHigh 5.00 (6.2%) 7.54 (12.1%) 51.0% ( 30% - 73%) 0.000 CFQHighLow 21.60 (5.2%) 34.57 (14.1%) 60.0% ( 38% - 83%) 0.000 Run 3 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 5.40 (6.9%) 4.06 (5.1%) -24.8% ( -34% - -13%) 0.000 CFQHighMedLow 7.64 (7.4%) 5.79 (6.3%) -24.2% ( -35% - -11%) 0.000 CFQHighHigh 11.11 (7.0%) 9.60 (5.9%) -13.6% ( -24% - 0%) 0.000 CFQHighLowLow 21.21 (7.6%) 21.22 (6.6%) 0.0% ( -13% - 15%) 0.993 PKLookup 103.15 (5.9%) 107.60 (6.9%) 4.3% ( -8% - 18%) 0.034 CFQHighLow 21.85 (8.1%) 34.18 (13.5%) 56.4% ( 32% - 
84%) 0.000 CFQHighMed 12.07 (8.4%) 19.98 (16.7%) 65.5% ( 37% - 98%) 0.000 Run 4 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 8.50 (5.8%) 6.85 (5.2%) -19.5% ( -28% - -8%) 0.000 CFQHighMedLow 10.89 (5.7%) 8.96 (5.4%) -17.8% ( -27% - -7%) 0.000 CFQHighMed 8.41 (5.8%) 7.74 (5.6%) -7.9% ( -18% - 3%) 0.000 CFQHighHighHigh 3.45 (6.7%) 3.38 (5.3%) -2.0% ( -13% - 10%) 0.287 CFQHighLowLow 7.82 (6.4%) 8.20 (7.5%) 4.8% ( -8% - 20%) 0.030 PKLookup 103.50 (5.0%) 110.69 (5.4%) 6.9% ( -3% - 18%) 0.000 CFQHighLow 11.46 (6.0%) 13.16 (6.7%) 14.8% ( 1% - 29%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 68 ++++++++++++------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 144f060329a..bfd97da2ba2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -29,7 +29,6 @@ import java.util.Objects; import java.util.Set; import java.util.TreeMap; -import java.util.stream.Collectors; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -425,21 +424,23 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpacts = new HashMap<>(); + Map> fieldImpactsEnum = new HashMap<>(); + Map> fieldImpacts = new HashMap<>(); for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); - fieldImpacts.putIfAbsent(fieldName, new ArrayList<>()); + fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); + if (scoreMode == ScoreMode.TOP_SCORES) { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); - fieldImpacts.get(fieldName).add(impactsEnum); + fieldImpactsEnum.get(fieldName).add(impactsEnum); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -477,7 +478,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = mergeImpacts(fieldImpacts, fieldWeights); + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -491,7 +492,7 @@ public boolean isCacheable(LeafReaderContext ctx) { } /** Merge impacts for combined field. 
*/ - static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map fieldWeights) { + static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, Map fieldWeights) { return new ImpactsSource() { class SubIterator { @@ -514,15 +515,12 @@ void next() { } } - Map> fieldsWithImpacts; @Override public Impacts getImpacts() throws IOException { - fieldsWithImpacts = new HashMap<>(); - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field // They collectively will decide on the number of levels and the block boundaries. - Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.keySet().size()); + Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); @@ -549,13 +547,27 @@ public Impacts getImpacts() throws IOException { @Override public int numLevels() { // max of levels across fields' impactEnums - return leadingImpactsPerField.values().stream().map(Impacts::numLevels).max(Integer::compareTo).get(); + int result = 0; + + for (Impacts impacts : leadingImpactsPerField.values()) { + result = Math.max(result, impacts.numLevels()); + } + + return result; } @Override public int getDocIdUpTo(int level) { // min of docIdUpTo across fields' impactEnums - return leadingImpactsPerField.values().stream().filter(i -> i.numLevels() > level).map(i -> i.getDocIdUpTo(level)).min(Integer::compareTo).get(); + int result = Integer.MAX_VALUE; + + for (Impacts impacts : leadingImpactsPerField.values()) { + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } + + return result; } @Override @@ -567,7 +579,7 @@ public List getImpacts(int level) { } private Map> mergeImpactsPerField(int docIdUpTo) { - final Map> result = new HashMap<>(); + final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { String field = impactsPerField.getKey(); @@ -593,7 +605,7 @@ private Map> mergeImpactsPerField(int docIdUpTo) { // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { - List> toMerge = new ArrayList<>(); + List> toMerge = new ArrayList<>(impactsEnums.size()); for (int i = 0; i < impactsEnums.size(); ++i) { if (impactsEnums.get(i).docID() <= docIdUpTo) { @@ -603,20 +615,19 @@ private List doMergeImpactsPerField(String field, List impa // return impacts that trigger the maximum score return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); } - final List impactList; float weight = fieldWeights.get(field); if (weight != 1f) { - impactList = - impacts.get(i).getImpacts(impactsLevel).stream() - .map( - impact -> - new Impact((int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))) - .collect(Collectors.toList()); + final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); + final List impactList = new ArrayList<>(originalImpactList.size()); + for (Impact impact : originalImpactList) { + impactList.add(new Impact((int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))); + + } + toMerge.add(impactList); } else { - impactList = impacts.get(i).getImpacts(impactsLevel); + 
toMerge.add(impacts.get(i).getImpacts(impactsLevel)); } - toMerge.add(impactList); } } @@ -709,7 +720,14 @@ private List mergeImpactsAcrossFields(Map> mergedIm minNorm = Math.min(minNorm, impacts.get(0).norm); } - return Collections.singletonList(new Impact(maxFreq * mergedImpactsPerField.size(), minNorm)); + int amplifiedMaxFreq = maxFreq * mergedImpactsPerField.size(); + + // overflow + if (amplifiedMaxFreq < 0) { + amplifiedMaxFreq = Integer.MAX_VALUE; + } + + return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } From 8ecdc694336c26312be2e3fc12c7b67594541b39 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 4 Nov 2021 21:16:52 -0700 Subject: [PATCH 04/16] Fix style --- .../sandbox/search/CombinedFieldQuery.java | 88 +++++++++++-------- .../search/TestCombinedFieldQuery.java | 78 ++++++++-------- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index bfd97da2ba2..a1ba083fc1e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -29,7 +29,6 @@ import java.util.Objects; import java.util.Set; import java.util.TreeMap; - import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Impact; @@ -113,7 +112,7 @@ public final class CombinedFieldQuery extends Query implements Accountable { } private static final long BASE_RAM_BYTES = - RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); + RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); /** A builder for {@link CombinedFieldQuery}. */ public static class Builder { @@ -469,7 +468,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { } queue.add( new WeightedDisiWrapper( - new TermScorer(this, iterators.get(i), nonScoringSimScorer), fieldAndWeight.weight)); + new TermScorer(this, iterators.get(i), nonScoringSimScorer), + fieldAndWeight.weight)); } // Even though it is called approximation, it is accurate since none of @@ -492,7 +492,10 @@ public boolean isCacheable(LeafReaderContext ctx) { } /** Merge impacts for combined field. */ - static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, Map fieldWeights) { + static ImpactsSource mergeImpacts( + Map> fieldsWithImpactsEnums, + Map> fieldsWithImpacts, + Map fieldWeights) { return new ImpactsSource() { class SubIterator { @@ -515,14 +518,15 @@ void next() { } } - @Override public Impacts getImpacts() throws IOException { - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field + // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for + // each field // They collectively will decide on the number of levels and the block boundaries. 
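A minimal standalone sketch of how those per-field leading impacts combine into the merged skip structure, using hypothetical docIdUpTo boundaries: numLevels is the maximum across fields, and getDocIdUpTo(level) is the minimum across the fields that reach that level:

import java.util.Map;

class MergedLevelsSketch {
  public static void main(String[] args) {
    // hypothetical per-field leading skip boundaries: docIdUpTo for level 0, 1, ...
    Map<String, int[]> leadingBoundaries =
        Map.of("title", new int[] {128, 1024}, "body", new int[] {64, 512, 8192});

    // numLevels: max across fields' leading impacts
    int numLevels =
        leadingBoundaries.values().stream().mapToInt(a -> a.length).max().orElse(0);
    System.out.println("numLevels = " + numLevels); // 3

    // getDocIdUpTo(level): min across the fields that have that level
    for (int level = 0; level < numLevels; level++) {
      final int l = level;
      int docIdUpTo =
          leadingBoundaries.values().stream()
              .filter(a -> a.length > l)
              .mapToInt(a -> a[l])
              .min()
              .orElse(Integer.MAX_VALUE);
      System.out.println("getDocIdUpTo(" + level + ") = " + docIdUpTo); // 64, 512, 8192
    }
  }
}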
Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); - for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { + for (Map.Entry> fieldImpacts : + fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); List impactsEnums = fieldImpacts.getValue(); fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); @@ -533,8 +537,7 @@ public Impacts getImpacts() throws IOException { Impacts impacts = impactsEnums.get(i).getImpacts(); fieldsWithImpacts.get(field).add(impacts); - if (tmpLead == null || - impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { tmpLead = impacts; } } @@ -550,7 +553,7 @@ public int numLevels() { int result = 0; for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); + result = Math.max(result, impacts.numLevels()); } return result; @@ -581,17 +584,22 @@ public List getImpacts(int level) { private Map> mergeImpactsPerField(int docIdUpTo) { final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); - for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { + for (Map.Entry> impactsPerField : + fieldsWithImpactsEnums.entrySet()) { String field = impactsPerField.getKey(); List impactsEnums = impactsPerField.getValue(); List impacts = fieldsWithImpacts.get(field); - List mergedImpacts = doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + List mergedImpacts = + doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); if (mergedImpacts.size() == 0) { - // all impactEnums for this field were positioned beyond docIdUpTo, continue to next field + // all impactEnums for this field were positioned beyond docIdUpTo, continue to next + // field continue; - } else if (mergedImpacts.size() == 1 && mergedImpacts.get(0).freq == Integer.MAX_VALUE && mergedImpacts.get(0).norm == 1L) { + } else if (mergedImpacts.size() == 1 + && mergedImpacts.get(0).freq == Integer.MAX_VALUE + && mergedImpacts.get(0).norm == 1L) { // one field gets impacts that trigger maximum score, pass it up return Collections.singletonMap(field, mergedImpacts); } else { @@ -602,9 +610,10 @@ private Map> mergeImpactsPerField(int docIdUpTo) { return result; } - - // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery - private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { + // Merge impacts from same field by summing freqs with the same norms - the same logic + // used for SynonymQuery + private List doMergeImpactsPerField( + String field, List impactsEnums, List impacts, int docIdUpTo) { List> toMerge = new ArrayList<>(impactsEnums.size()); for (int i = 0; i < impactsEnums.size(); ++i) { @@ -620,9 +629,11 @@ private List doMergeImpactsPerField(String field, List impa final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); final List impactList = new ArrayList<>(originalImpactList.size()); for (Impact impact : originalImpactList) { - impactList.add(new Impact((int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))); - + impactList.add( + new Impact( + (int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4( + (int) Math.floor(normToLength(impact.norm) * weight)))); } toMerge.add(impactList); } else { @@ -642,18 +653,18 @@ private List doMergeImpactsPerField(String field, List impa } PriorityQueue pq = - new 
PriorityQueue(impacts.size()) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; + new PriorityQueue(impacts.size()) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; for (List toMergeImpacts : toMerge) { pq.add(new SubIterator(toMergeImpacts.iterator())); @@ -704,13 +715,16 @@ private int getLevel(Impacts impacts, int docIdUpTo) { return -1; } - private List mergeImpactsAcrossFields(Map> mergedImpactsPerField) { + private List mergeImpactsAcrossFields( + Map> mergedImpactsPerField) { if (mergedImpactsPerField.size() == 1) { return mergedImpactsPerField.values().iterator().next(); } - // upper-bound by creating an impact that should be most competitive: - // this is done to avoid the potential combinatorial explosion from accurate computation on merged impacts across fields + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential combinatorial explosion from accurate computation + // on merged impacts across fields int maxFreq = 0; long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { @@ -729,8 +743,6 @@ private List mergeImpactsAcrossFields(Map> mergedIm return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } - - }; } @@ -740,7 +752,7 @@ private float normToLength(long norm) { @Override public void advanceShallow(int target) throws IOException { - for (List impactsEnums: fieldsWithImpactsEnums.values()) { + for (List impactsEnums : fieldsWithImpactsEnums.values()) { for (ImpactsEnum impactsEnum : impactsEnums) { if (impactsEnum.docID() < target) { impactsEnum.advanceShallow(target); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 4d5d3ac0b19..d464765489b 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -17,13 +17,11 @@ package org.apache.lucene.sandbox.search; import com.carrotsearch.randomizedtesting.generators.RandomPicks; - import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.concurrent.TimeUnit; - import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; @@ -173,47 +171,49 @@ public void testSameScore() throws IOException { } public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { - Path path = Paths.get("/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + Path path = + Paths.get( + "/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); Directory index = FSDirectory.open(path); 
IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); // HighHigh: but publisher # freq=1456553 freq=1289029 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("but")) -// .addTerm(new BytesRef("publisher")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("but")) + // .addTerm(new BytesRef("publisher")) + // .build(); // HighMed: from office # freq=3224339 freq=225338 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("from")) -// .addTerm(new BytesRef("office")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("from")) + // .addTerm(new BytesRef("office")) + // .build(); // HighLow: with fung # freq=3709421 freq=1344 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("with")) -// .addTerm(new BytesRef("fung")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("with")) + // .addTerm(new BytesRef("fung")) + // .build(); // HighLow: date insult # freq=2020626 freq=4424 CombinedFieldQuery query = - new CombinedFieldQuery.Builder() - .addField("title", (float) 4.0) - .addField("body", (float) 2.0) - .addTerm(new BytesRef("date")) - .addTerm(new BytesRef("insult")) - .build(); + new CombinedFieldQuery.Builder() + .addField("title", (float) 4.0) + .addField("body", (float) 2.0) + .addTerm(new BytesRef("date")) + .addTerm(new BytesRef("insult")) + .build(); long start; long end; @@ -222,26 +222,32 @@ public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOExc TopScoreDocCollector topScoresCollector = null; start = System.nanoTime(); - for (int i = 0 ; i < runCount ; i++) { + for (int i = 0; i < runCount; i++) { topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES searcher.search(query, topScoresCollector); } end = System.nanoTime(); duration = (end - start) / runCount; - System.out.println("// top scores time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); - + System.out.println( + "// top scores time usage\t" + + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + + " milliseconds"); TopScoreDocCollector completeCollector = null; start = System.nanoTime(); - for (int i = 0 ; i < runCount ; i++) { + for (int i = 0; i < runCount; i++) { completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE searcher.search(query, completeCollector); } end = System.nanoTime(); duration = (end - start) / runCount; - System.out.println("// complete time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + System.out.println( + "// complete time usage\t" + + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + + " milliseconds"); - CheckHits.checkEqual(query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); + 
CheckHits.checkEqual( + query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); reader.close(); } From b4ac2b488cdb26b63b79bd7e792df4deb10c8d3f Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sat, 6 Nov 2021 11:15:35 -0700 Subject: [PATCH 05/16] Update test --- .../randomization/policies/tests.policy | 5 - .../search/TestCombinedFieldQuery.java | 172 ++++++++++-------- 2 files changed, 100 insertions(+), 77 deletions(-) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index 178fc6bbfa5..71dc7991fdb 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -121,11 +121,6 @@ grant codeBase "file:${gradle.lib.dir}${/}-" { permission java.security.AllPermission; }; -// nocommit - hack to run test on indices stored elsewhere -grant { - permission java.security.AllPermission; -}; - grant codeBase "file:${gradle.worker.jar}" { permission java.security.AllPermission; }; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index d464765489b..6f72169aa7c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -16,17 +16,17 @@ */ package org.apache.lucene.sandbox.search; +import static com.carrotsearch.randomizedtesting.RandomizedTest.atMost; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; + import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Arrays; -import java.util.concurrent.TimeUnit; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -50,7 +50,6 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -171,85 +170,114 @@ public void testSameScore() throws IOException { } public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { - Path path = - Paths.get( - "/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + int numDocs = + randomBoolean() + ? 
atLeast(1000) + : atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data + int numMatchDoc = randomIntBetween(200, 500); + int numHits = atMost(100); + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); - Directory index = FSDirectory.open(path); - IndexReader reader = DirectoryReader.open(index); - IndexSearcher searcher = new IndexSearcher(reader); + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + // adding non-matching docs + for (int i = 0; i < numDocs - numMatchDoc; ++i) { + Document doc = new Document(); + + int freqA = random().nextInt(50) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "bar" + j, Store.NO)); + } + + int freqB = random().nextInt(50) + 1; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "bla" + j, Store.NO)); + } + int freqC = random().nextInt(50) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + // adding potentially matching doc + for (int i = 0; i < numMatchDoc; i++) { + Document doc = new Document(); + + int freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo" + j, Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "zoo", Store.NO)); + } + } + + int freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo", Store.NO)); + } + } + + freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo" + j, Store.NO)); + } + } + + int freqC = random().nextInt(20) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); - // HighHigh: but publisher # freq=1456553 freq=1289029 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("but")) - // .addTerm(new BytesRef("publisher")) - // .build(); - - // HighMed: from office # freq=3224339 freq=225338 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("from")) - // .addTerm(new BytesRef("office")) - // .build(); - - // HighLow: with fung # freq=3709421 freq=1344 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("with")) - // .addTerm(new BytesRef("fung")) - // .build(); - - // HighLow: date insult # freq=2020626 freq=4424 CombinedFieldQuery query = new CombinedFieldQuery.Builder() - .addField("title", (float) 4.0) - .addField("body", (float) 2.0) - .addTerm(new BytesRef("date")) - .addTerm(new BytesRef("insult")) + .addField("a", (float) boost1) + .addField("b", (float) 
boost2) + .addTerm(new BytesRef("foo")) + .addTerm(new BytesRef("zoo")) .build(); - long start; - long end; - long duration; - int runCount = 20; + TopScoreDocCollector topScoresCollector = + TopScoreDocCollector.create(numHits, null, numHits); // TOP_SCORES + searcher.search(query, topScoresCollector); - TopScoreDocCollector topScoresCollector = null; - start = System.nanoTime(); - for (int i = 0; i < runCount; i++) { - topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES - searcher.search(query, topScoresCollector); - } - end = System.nanoTime(); - duration = (end - start) / runCount; - System.out.println( - "// top scores time usage\t" - + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) - + " milliseconds"); - - TopScoreDocCollector completeCollector = null; - start = System.nanoTime(); - for (int i = 0; i < runCount; i++) { - completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE - searcher.search(query, completeCollector); - } - end = System.nanoTime(); - duration = (end - start) / runCount; - System.out.println( - "// complete time usage\t" - + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) - + " milliseconds"); + TopScoreDocCollector completeCollector = + TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE); // COMPLETE + searcher.search(query, completeCollector); CheckHits.checkEqual( query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); reader.close(); + w.close(); + dir.close(); } public void testNormsDisabled() throws IOException { From 4244f39d6f63161b04ae7eefe502212bc06f6bea Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sat, 6 Nov 2021 11:40:55 -0700 Subject: [PATCH 06/16] Update amplifiedMaxFreq calculation --- .../lucene/sandbox/search/CombinedFieldQuery.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index a1ba083fc1e..671855f9463 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -734,13 +734,17 @@ private List mergeImpactsAcrossFields( minNorm = Math.min(minNorm, impacts.get(0).norm); } - int amplifiedMaxFreq = maxFreq * mergedImpactsPerField.size(); - - // overflow - if (amplifiedMaxFreq < 0) { - amplifiedMaxFreq = Integer.MAX_VALUE; + int amplifiedMaxFreq = 0; + if (maxFreq == Integer.MAX_VALUE) { + amplifiedMaxFreq = maxFreq; + } else { + amplifiedMaxFreq = + (int) Math.min((long) maxFreq * mergedImpactsPerField.size(), Integer.MAX_VALUE); } + // no overflow should occur + assert amplifiedMaxFreq > 0; + return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } }; From 81206d44d5bac5d4249bba4b94058a2e8aed0e2d Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sun, 7 Nov 2021 10:19:36 -0800 Subject: [PATCH 07/16] Simplify and reduce maxFreq Run 1 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighMed 16.99 (4.9%) 16.84 (14.9%) -0.9% ( -19% - 19%) 0.790 PKLookup 98.49 (8.7%) 121.99 (9.1%) 23.9% ( 5% - 45%) 0.000 CFQHighHigh 5.82 (4.7%) 7.55 (10.0%) 29.8% ( 14% - 46%) 0.000 CFQHighLow 17.59 (5.2%) 25.01 (17.3%) 42.2% ( 18% - 68%) 0.000 Run 2 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighMed 18.28 (6.1%) 15.63 (15.6%) -14.5% ( -34% - 7%) 
0.000 PKLookup 101.26 (10.0%) 105.39 (12.0%) 4.1% ( -16% - 28%) 0.243 CFQHighHigh 9.96 (5.6%) 10.47 (13.0%) 5.2% ( -12% - 25%) 0.101 CFQHighLow 9.71 (5.8%) 15.71 (34.0%) 61.8% ( 20% - 107%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 671855f9463..5a266846636 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -423,8 +423,8 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpactsEnum = new HashMap<>(); - Map> fieldImpacts = new HashMap<>(); + Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); + Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); @@ -458,7 +458,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); - Map fieldWeights = new HashMap<>(); + Map fieldWeights = new HashMap<>(fieldImpactsEnum.size()); for (int i = 0; i < iterators.size(); i++) { FieldAndWeight fieldAndWeight = fields.get(i); if (fieldWeights.containsKey(fieldAndWeight.field)) { @@ -721,31 +721,22 @@ private List mergeImpactsAcrossFields( return mergedImpactsPerField.values().iterator().next(); } - // upper-bound by creating an impact that should be most competitive: - // this is done to avoid the potential combinatorial explosion from accurate computation + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential costly combinatorial explosion from accurate + // computation // on merged impacts across fields - int maxFreq = 0; + long maxFreqSum = 0; long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { // highest freq at the end of each impact list - maxFreq = Math.max(maxFreq, impacts.get(impacts.size() - 1).freq); + maxFreqSum += impacts.get(impacts.size() - 1).freq; // lowest norm at the start of each impact list minNorm = Math.min(minNorm, impacts.get(0).norm); } - int amplifiedMaxFreq = 0; - if (maxFreq == Integer.MAX_VALUE) { - amplifiedMaxFreq = maxFreq; - } else { - amplifiedMaxFreq = - (int) Math.min((long) maxFreq * mergedImpactsPerField.size(), Integer.MAX_VALUE); - } - - // no overflow should occur - assert amplifiedMaxFreq > 0; - - return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); + return Collections.singletonList( + new Impact((int) Math.min(maxFreqSum, Integer.MAX_VALUE), minNorm)); } }; } From 6d5e7808fd847d5849e7c4ff3b552aeb9c196bbb Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 18 Nov 2021 23:49:42 -0800 Subject: [PATCH 08/16] cache leadingImpactsPerField --- .../sandbox/search/CombinedFieldQuery.java | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 
1e69a232764..d151bd917e2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -497,6 +497,7 @@ static ImpactsSource mergeImpacts( Map> fieldsWithImpacts, Map fieldWeights) { return new ImpactsSource() { + Map leadingImpactsPerField = null; class SubIterator { final Iterator iterator; @@ -523,26 +524,28 @@ public Impacts getImpacts() throws IOException { // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for // each field // They collectively will decide on the number of levels and the block boundaries. - Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); - - for (Map.Entry> fieldImpacts : - fieldsWithImpactsEnums.entrySet()) { - String field = fieldImpacts.getKey(); - List impactsEnums = fieldImpacts.getValue(); - fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); - - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts; + + if (leadingImpactsPerField == null) { + leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); + for (Map.Entry> fieldImpacts : + fieldsWithImpactsEnums.entrySet()) { + String field = fieldImpacts.getKey(); + List impactsEnums = fieldImpacts.getValue(); + fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } } - } - leadingImpactsPerField.put(field, tmpLead); + leadingImpactsPerField.put(field, tmpLead); + } } return new Impacts() { @@ -754,6 +757,7 @@ public void advanceShallow(int target) throws IOException { } } } + leadingImpactsPerField = null; } }; } From db2446f0337e5860bb8c731df1f6c16281efb6bb Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Mon, 22 Nov 2021 01:16:35 -0800 Subject: [PATCH 09/16] test leadImpact from highest weighted field --- .../sandbox/search/CombinedFieldQuery.java | 85 ++++++++++++------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d151bd917e2..a04316b3457 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -424,6 +424,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); + Map> fieldTermDocFreq = new HashMap<>(fieldAndWeights.size()); Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); for (int i = 0; i < fieldTerms.length; i++) { @@ -432,6 +433,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); 
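          // Only ScoreMode.TOP_SCORES gathers ImpactsEnums into the per-field maps below;
          // mergeImpacts() later uses them to derive block-level score upper bounds, while
          // other score modes keep plain PostingsEnums and skip this bookkeeping.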
fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); + fieldTermDocFreq.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); @@ -440,6 +442,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); fieldImpactsEnum.get(fieldName).add(impactsEnum); + fieldTermDocFreq.get(fieldName).add(termsEnum.docFreq()); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -478,7 +481,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights); + ImpactsSource impactsSource = + mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldTermDocFreq, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -495,9 +499,10 @@ public boolean isCacheable(LeafReaderContext ctx) { static ImpactsSource mergeImpacts( Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, + Map> fieldTermDocFreq, Map fieldWeights) { return new ImpactsSource() { - Map leadingImpactsPerField = null; + Impacts leadingImpacts = null; class SubIterator { final Iterator iterator; @@ -525,55 +530,69 @@ public Impacts getImpacts() throws IOException { // each field // They collectively will decide on the number of levels and the block boundaries. - if (leadingImpactsPerField == null) { - leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); + if (leadingImpacts == null) { + float maxWeight = Float.MIN_VALUE; + String maxWeightField = ""; + + for (Map.Entry fieldWeightEntry : fieldWeights.entrySet()) { + String field = fieldWeightEntry.getKey(); + float weight = fieldWeightEntry.getValue(); + + if (maxWeight < weight) { + maxWeight = weight; + maxWeightField = field; + } + } + + Impacts tmpLead = null; for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); List impactsEnums = fieldImpacts.getValue(); + // List docFreqs = fieldTermDocFreq.get(field); fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts; + if (field.equals(maxWeightField)) { + int minDocFreq = Integer.MAX_VALUE; + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + // use the impact of term within this mostly weighted field that has the least doc + // freq + // this may have the result of getting larger upTo bound, as low doc freq term may + // also have large gap in doc ids + // int docFreq = docFreqs.get(i); + // if (tmpLead == null || docFreq < minDocFreq) { + // minDocFreq = docFreq; + // tmpLead = impacts; + // } + + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } + } + } else { + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = 
impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); } } - - leadingImpactsPerField.put(field, tmpLead); } + leadingImpacts = tmpLead; } return new Impacts() { @Override public int numLevels() { - // max of levels across fields' impactEnums - int result = 0; - - for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); - } - - return result; + return leadingImpacts.numLevels(); } @Override public int getDocIdUpTo(int level) { - // min of docIdUpTo across fields' impactEnums - int result = Integer.MAX_VALUE; - - for (Impacts impacts : leadingImpactsPerField.values()) { - if (impacts.numLevels() > level) { - result = Math.min(result, impacts.getDocIdUpTo(level)); - } - } - - return result; + return leadingImpacts.getDocIdUpTo(level); } @Override @@ -757,7 +776,7 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpactsPerField = null; + leadingImpacts = null; } }; } From 3d0a215d0174f13b508560bd476cd68475f05864 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sun, 28 Nov 2021 18:18:10 -0800 Subject: [PATCH 10/16] Fix bug Results from python3 src/python/localrun.py -source combinedFieldsUnevenlyWeightedBig: Run 1: TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 2.81 (10.9%) 2.40 (10.3%) -14.6% ( -32% - 7%) 0.000 CFQHighMed 3.27 (11.8%) 2.91 (11.0%) -11.2% ( -30% - 13%) 0.002 CFQHighLow 16.09 (14.6%) 14.58 (11.9%) -9.4% ( -31% - 20%) 0.025 PKLookup 92.08 (11.6%) 96.23 (13.3%) 4.5% ( -18% - 33%) 0.255 Run 2: TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighLow 15.76 (12.0%) 10.19 (12.9%) -35.3% ( -53% - -11%) 0.000 CFQHighMed 12.01 (13.3%) 8.79 (13.9%) -26.9% ( -47% - 0%) 0.000 CFQHighHigh 5.59 (10.8%) 4.40 (8.4%) -21.2% ( -36% - -2%) 0.000 PKLookup 93.36 (21.9%) 94.29 (22.6%) 1.0% ( -35% - 58%) 0.888 --- .../org/apache/lucene/sandbox/search/CombinedFieldQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index a04316b3457..dc90476152d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -749,7 +749,7 @@ private List mergeImpactsAcrossFields( // computation // on merged impacts across fields long maxFreqSum = 0; - long minNorm = Long.MIN_VALUE; + long minNorm = Long.MAX_VALUE; for (List impacts : mergedImpactsPerField.values()) { // highest freq at the end of each impact list maxFreqSum += impacts.get(impacts.size() - 1).freq; From 8a7ea9998b189649ccdc75ecdd4b096460abb0af Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 20:34:40 -0800 Subject: [PATCH 11/16] Address feedback - avoid repeated computation for max weighted field, replace doc freq collection with ImpactsEnum#cost --- .../sandbox/search/CombinedFieldQuery.java | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index dc90476152d..9833a3e1a49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -424,25 +424,30 @@ public 
Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); - Map> fieldTermDocFreq = new HashMap<>(fieldAndWeights.size()); Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); + float maxWeight = Float.MIN_VALUE; + String maxWeightField = ""; for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); - fieldTermDocFreq.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); if (scoreMode == ScoreMode.TOP_SCORES) { + float weight = fieldAndWeights.get(fieldName).weight; + if (maxWeight < weight) { + maxWeight = weight; + maxWeightField = fieldName; + } + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); fieldImpactsEnum.get(fieldName).add(impactsEnum); - fieldTermDocFreq.get(fieldName).add(termsEnum.docFreq()); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -482,7 +487,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { if (scoreMode == ScoreMode.TOP_SCORES) { ImpactsSource impactsSource = - mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldTermDocFreq, fieldWeights); + mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights, maxWeightField); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -499,8 +504,8 @@ public boolean isCacheable(LeafReaderContext ctx) { static ImpactsSource mergeImpacts( Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, - Map> fieldTermDocFreq, - Map fieldWeights) { + Map fieldWeights, + String maxWeightField) { return new ImpactsSource() { Impacts leadingImpacts = null; @@ -531,19 +536,6 @@ public Impacts getImpacts() throws IOException { // They collectively will decide on the number of levels and the block boundaries. 
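        // Note: leadingImpacts is computed lazily and cached across getImpacts() calls;
        // advanceShallow(target) resets it to null after advancing the wrapped enums, so
        // the lead is re-selected once per shallow advance rather than on every call.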
if (leadingImpacts == null) { - float maxWeight = Float.MIN_VALUE; - String maxWeightField = ""; - - for (Map.Entry fieldWeightEntry : fieldWeights.entrySet()) { - String field = fieldWeightEntry.getKey(); - float weight = fieldWeightEntry.getValue(); - - if (maxWeight < weight) { - maxWeight = weight; - maxWeightField = field; - } - } - Impacts tmpLead = null; for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { @@ -553,24 +545,27 @@ public Impacts getImpacts() throws IOException { fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); if (field.equals(maxWeightField)) { - int minDocFreq = Integer.MAX_VALUE; + long minCost = Long.MAX_VALUE; for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); + ImpactsEnum impactsEnum = impactsEnums.get(i); + Impacts impacts = impactsEnum.getImpacts(); fieldsWithImpacts.get(field).add(impacts); // use the impact of term within this mostly weighted field that has the least doc - // freq - // this may have the result of getting larger upTo bound, as low doc freq term may + // freq (cost) + // this may have the result of getting larger upTo bound, as low doc freq (cost) + // term may // also have large gap in doc ids - // int docFreq = docFreqs.get(i); - // if (tmpLead == null || docFreq < minDocFreq) { - // minDocFreq = docFreq; - // tmpLead = impacts; - // } - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + // long currentCost = impactsEnums.get(i).cost(); + if (tmpLead == null || impactsEnum.cost() < minCost) { + minCost = impactsEnum.cost(); tmpLead = impacts; } + + // if (tmpLead == null || impacts.getDocIdUpTo(0) < + // tmpLead.getDocIdUpTo(0)) { + // tmpLead = impacts; + // } } } else { // find the impact that has the lowest next boundary for this field From 0a9bdccf01d568b0239cf894ff626e47d8333df3 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 21:16:15 -0800 Subject: [PATCH 12/16] Refactoring - extract out logic that merges impacts within the same field --- .../lucene/search/ImpactsMergingUtils.java | 141 ++++++++++++++++++ .../apache/lucene/search/SynonymQuery.java | 122 +-------------- 2 files changed, 143 insertions(+), 120 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java new file mode 100644 index 00000000000..856ad31cb2b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -0,0 +1,141 @@ +package org.apache.lucene.search; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.util.PriorityQueue; + +/** + * Utils for merging impacts for SynonymQuery, CombinedFieldsQuery etc + * + * @lucene.internal + */ +public class ImpactsMergingUtils { + + /** + * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if + * there is no such level. 
+ */ + private static int getLevel(Impacts impacts, int docIdUpTo) { + for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { + if (impacts.getDocIdUpTo(level) >= docIdUpTo) { + return level; + } + } + return -1; + } + + private static class SubIterator { + final Iterator iterator; + int previousFreq; + Impact current; + + SubIterator(Iterator iterator) { + this.iterator = iterator; + this.current = iterator.next(); + } + + void next() { + previousFreq = current.freq; + if (iterator.hasNext() == false) { + current = null; + } else { + current = iterator.next(); + } + } + } + + /** + * Merge impacts from multiple impactsEnum (terms matches) within the same field. + * The high level logic is to combine freqs that have the same norm from impacts. + */ + public static List mergeImpactsPerField( + ImpactsEnum[] impactsEnum, final Impacts[] impacts, float[] boosts, int docIdUpTo) { + List> toMerge = new ArrayList<>(); + + for (int i = 0; i < impactsEnum.length; ++i) { + if (impactsEnum[i].docID() <= docIdUpTo) { + int impactsLevel = getLevel(impacts[i], docIdUpTo); + if (impactsLevel == -1) { + // One instance doesn't have impacts that cover up to docIdUpTo + // Return impacts that trigger the maximum score + return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + } + final List impactList; + if (boosts[i] != 1f) { + float boost = boosts[i]; + impactList = + impacts[i].getImpacts(impactsLevel).stream() + .map(impact -> new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) + .collect(Collectors.toList()); + } else { + impactList = impacts[i].getImpacts(impactsLevel); + } + toMerge.add(impactList); + } + } + assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong + + if (toMerge.size() == 1) { + // common if one synonym is common and the other one is rare + return toMerge.get(0); + } + + PriorityQueue pq = + new PriorityQueue(impacts.length) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; + for (List toMergeImpacts : toMerge) { + pq.add(new SubIterator(toMergeImpacts.iterator())); + } + + List mergedImpacts = new ArrayList<>(); + + // Idea: merge impacts by norm. The tricky thing is that we need to + // consider norm values that are not in the impacts too. For + // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], + // there might well be a document that has a freq of 2 and a length of 11, + // which was just not added to the list of impacts because {freq=2,norm=10} + // is more competitive. So the way it works is that we track the sum of + // the term freqs that we have seen so far in order to account for these + // implicit impacts. 
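+      // Illustrative example of the merge: combining [{freq=2,norm=10}, {freq=4,norm=12}]
+      // with [{freq=3,norm=10}] yields [{freq=5,norm=10}, {freq=7,norm=12}]: freqs are
+      // summed per norm, and the running sum carries lower-norm freqs forward as the
+      // implicit impacts described above.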
+ + long sumTf = 0; + SubIterator top = pq.top(); + do { + final long norm = top.current.norm; + do { + sumTf += top.current.freq - top.previousFreq; + top.next(); + top = pq.updateTop(); + } while (top.current != null && top.current.norm == norm); + + final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); + if (mergedImpacts.isEmpty()) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } else { + Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); + assert Long.compareUnsigned(prevImpact.norm, norm) < 0; + if (freqUpperBound > prevImpact.freq) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } // otherwise the previous impact is already more competitive + } + } while (top.current != null); + + return mergedImpacts; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 2aefe3f7860..7905fa60ac3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -21,7 +21,6 @@ import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; @@ -40,7 +39,6 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PriorityQueue; /** * A query that treats multiple terms as synonyms. @@ -346,26 +344,6 @@ static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) { assert impactsEnums.length == boosts.length; return new ImpactsSource() { - class SubIterator { - final Iterator iterator; - int previousFreq; - Impact current; - - SubIterator(Iterator iterator) { - this.iterator = iterator; - this.current = iterator.next(); - } - - void next() { - previousFreq = current.freq; - if (iterator.hasNext() == false) { - current = null; - } else { - current = iterator.next(); - } - } - } - @Override public Impacts getImpacts() throws IOException { final Impacts[] impacts = new Impacts[impactsEnums.length]; @@ -393,107 +371,11 @@ public int getDocIdUpTo(int level) { return lead.getDocIdUpTo(level); } - /** - * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} - * if there is no such level. 
- */ - private int getLevel(Impacts impacts, int docIdUpTo) { - for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { - if (impacts.getDocIdUpTo(level) >= docIdUpTo) { - return level; - } - } - return -1; - } - @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); - - List> toMerge = new ArrayList<>(); - - for (int i = 0; i < impactsEnums.length; ++i) { - if (impactsEnums[i].docID() <= docIdUpTo) { - int impactsLevel = getLevel(impacts[i], docIdUpTo); - if (impactsLevel == -1) { - // One instance doesn't have impacts that cover up to docIdUpTo - // Return impacts that trigger the maximum score - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - final List impactList; - if (boosts[i] != 1f) { - float boost = boosts[i]; - impactList = - impacts[i].getImpacts(impactsLevel).stream() - .map( - impact -> - new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) - .collect(Collectors.toList()); - } else { - impactList = impacts[i].getImpacts(impactsLevel); - } - toMerge.add(impactList); - } - } - assert toMerge.size() - > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong - - if (toMerge.size() == 1) { - // common if one synonym is common and the other one is rare - return toMerge.get(0); - } - - PriorityQueue pq = - new PriorityQueue(impacts.length) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; - for (List impacts : toMerge) { - pq.add(new SubIterator(impacts.iterator())); - } - - List mergedImpacts = new ArrayList<>(); - - // Idea: merge impacts by norm. The tricky thing is that we need to - // consider norm values that are not in the impacts too. For - // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], - // there might well be a document that has a freq of 2 and a length of 11, - // which was just not added to the list of impacts because {freq=2,norm=10} - // is more competitive. So the way it works is that we track the sum of - // the term freqs that we have seen so far in order to account for these - // implicit impacts. 
- - long sumTf = 0; - SubIterator top = pq.top(); - do { - final long norm = top.current.norm; - do { - sumTf += top.current.freq - top.previousFreq; - top.next(); - top = pq.updateTop(); - } while (top.current != null && top.current.norm == norm); - - final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); - if (mergedImpacts.isEmpty()) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } else { - Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); - assert Long.compareUnsigned(prevImpact.norm, norm) < 0; - if (freqUpperBound > prevImpact.freq) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } // otherwise the previous impact is already more competitive - } - } while (top.current != null); - - return mergedImpacts; + return ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, impacts, boosts, docIdUpTo); } }; } From 808fec28ab6d7179974665e7fa069270845811fa Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 22:05:51 -0800 Subject: [PATCH 13/16] Refactoring - use ImpactsMergingUtils in CombinedFieldsQuery TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighLow 21.45 (13.8%) 15.65 (18.0%) -27.1% ( -51% - 5%) 0.000 CFQHighMed 5.92 (4.2%) 4.52 (12.2%) -23.7% ( -38% - -7%) 0.000 CFQHighHigh 5.52 (5.0%) 4.45 (12.1%) -19.4% ( -34% - -2%) 0.000 PKLookup 110.45 (11.1%) 108.08 (15.7%) -2.1% ( -26% - 27%) 0.617 = Candidate CPU JFR PROFILE SUMMARY from 86268 events (total: 86268) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 15.48% 13355 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 7.37% 6359 org.apache.lucene.search.DisjunctionDISIApproximation#advance() 7.35% 6345 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 6.38% 5508 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.18% 3605 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 4.10% 3538 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.01% 3456 org.apache.lucene.search.DisiPriorityQueue#topList() 3.67% 3169 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 3.36% 2899 org.apache.lucene.search.DisiPriorityQueue#top() 3.08% 2660 java.lang.Math#round() 2.40% 2067 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.38% 2057 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 1.94% 1675 org.apache.lucene.util.SmallFloat#longToInt4() 1.87% 1617 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.86% 1605 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.80% 1557 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater() 1.50% 1293 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#advance() 1.39% 1200 org.apache.lucene.search.ImpactsDISI#docID() 1.33% 1145 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.24% 1067 jdk.internal.misc.Unsafe#getByte() 1.18% 1019 org.apache.lucene.store.ByteBufferGuard#getByte() 0.92% 792 org.apache.lucene.util.SmallFloat#intToByte4() 0.76% 659 java.lang.Math#toIntExact() 0.68% 584 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#freq() 0.67% 579 org.apache.lucene.search.ImpactsDISI#nextDoc() 0.61% 530 org.apache.lucene.search.ImpactsDISI#advanceTarget() 0.58% 500 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 0.57% 
492 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.40% 345 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.39% 333 org.apache.lucene.codecs.lucene90.ForUtil#expand8() = Baseline CPU JFR PROFILE SUMMARY from 76683 events (total: 76683) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 16.26% 12468 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 8.46% 6490 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 8.24% 6315 org.apache.lucene.search.DisjunctionDISIApproximation#nextDoc() 6.13% 4698 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.79% 3676 org.apache.lucene.search.DisiPriorityQueue#topList() 4.45% 3412 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.32% 3314 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 3.25% 2492 java.lang.Math#round() 3.01% 2309 org.apache.lucene.search.DisiPriorityQueue#top() 2.75% 2107 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.62% 2010 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 2.29% 1753 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 2.12% 1623 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.05% 1569 org.apache.lucene.util.SmallFloat#longToInt4() 1.93% 1481 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.77% 1358 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.45% 1110 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.25% 958 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#nextDoc() 1.19% 914 org.apache.lucene.store.ByteBufferGuard#getByte() 1.17% 898 java.lang.Math#toIntExact() 1.06% 815 org.apache.lucene.util.SmallFloat#intToByte4() 1.02% 782 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#freq() 0.86% 656 jdk.internal.misc.Unsafe#getByte() 0.68% 518 org.apache.lucene.codecs.lucene90.PForUtil#decodeAndPrefixSum() 0.64% 492 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.49% 375 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.42% 322 org.apache.lucene.codecs.lucene90.ForUtil#expand8() 0.31% 239 java.util.zip.Inflater#inflateBytesBytes() 0.30% 228 org.apache.lucene.codecs.lucene90.blocktree.SegmentTermsEnum#seekExact() 0.29% 224 org.apache.lucene.codecs.lucene90.PForUtil#expand32() --- .../lucene/search/ImpactsMergingUtils.java | 59 ++++- .../apache/lucene/search/SynonymQuery.java | 2 +- .../sandbox/search/CombinedFieldQuery.java | 232 ++++-------------- 3 files changed, 102 insertions(+), 191 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java index 856ad31cb2b..2c50072f307 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.search; import java.util.ArrayList; @@ -9,13 +25,22 @@ import org.apache.lucene.index.Impacts; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.SmallFloat; /** * Utils for merging impacts for SynonymQuery, CombinedFieldsQuery etc * * @lucene.internal */ -public class ImpactsMergingUtils { +public final class ImpactsMergingUtils { + /** Cache of decoded norms. */ + private static final float[] LENGTH_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); + } + } /** * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if @@ -50,12 +75,23 @@ void next() { } } + private static double normToLength(long norm) { + return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; + } + /** - * Merge impacts from multiple impactsEnum (terms matches) within the same field. - * The high level logic is to combine freqs that have the same norm from impacts. + * Merge impacts from multiple impactsEnum (terms matches) within the same field. The high level + * logic is to combine freqs that have the same norm from impacts. */ public static List mergeImpactsPerField( - ImpactsEnum[] impactsEnum, final Impacts[] impacts, float[] boosts, int docIdUpTo) { + ImpactsEnum[] impactsEnum, + Impacts[] impacts, + float[] termBoosts, + int docIdUpTo, + boolean combineMultiNorms) { + assert impactsEnum.length == impacts.length; + assert impactsEnum.length == termBoosts.length; + List> toMerge = new ArrayList<>(); for (int i = 0; i < impactsEnum.length; ++i) { @@ -67,11 +103,20 @@ public static List mergeImpactsPerField( return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); } final List impactList; - if (boosts[i] != 1f) { - float boost = boosts[i]; + if (termBoosts[i] != 1f) { + float boost = termBoosts[i]; impactList = impacts[i].getImpacts(impactsLevel).stream() - .map(impact -> new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) + .map( + impact -> { + int boostedFreq = (int) Math.ceil(impact.freq * boost); + long boostedNorm = + combineMultiNorms + ? 
SmallFloat.intToByte4( + (int) Math.floor(normToLength(impact.norm) * boost)) + : impact.norm; + return new Impact(boostedFreq, boostedNorm); + }) .collect(Collectors.toList()); } else { impactList = impacts[i].getImpacts(impactsLevel); diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 7905fa60ac3..66044ab412c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -375,7 +375,7 @@ public int getDocIdUpTo(int level) { public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); return ImpactsMergingUtils.mergeImpactsPerField( - impactsEnums, impacts, boosts, docIdUpTo); + impactsEnums, impacts, boosts, docIdUpTo, false); } }; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 9833a3e1a49..6b6cfd14a9c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -22,8 +22,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; @@ -51,6 +49,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ImpactsDISI; +import org.apache.lucene.search.ImpactsMergingUtils; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; @@ -68,7 +67,6 @@ import org.apache.lucene.search.similarities.SimilarityBase; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SmallFloat; @@ -102,15 +100,6 @@ * @lucene.experimental */ public final class CombinedFieldQuery extends Query implements Accountable { - /** Cache of decoded norms. 
*/ - private static final float[] LENGTH_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) { - LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); - } - } - private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); @@ -423,8 +412,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); - Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); + Map> tempFieldImpactsEnums = new HashMap<>(fieldAndWeights.size()); float maxWeight = Float.MIN_VALUE; String maxWeightField = ""; @@ -433,7 +421,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); - fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); + tempFieldImpactsEnums.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); @@ -447,7 +435,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); - fieldImpactsEnum.get(fieldName).add(impactsEnum); + tempFieldImpactsEnums.get(fieldName).add(impactsEnum); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -466,14 +454,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); - Map fieldWeights = new HashMap<>(fieldImpactsEnum.size()); for (int i = 0; i < iterators.size(); i++) { FieldAndWeight fieldAndWeight = fields.get(i); - if (fieldWeights.containsKey(fieldAndWeight.field)) { - assert fieldWeights.get(fieldAndWeight.field).equals(fieldAndWeight.weight); - } else { - fieldWeights.put(fieldAndWeight.field, fieldAndWeight.weight); - } queue.add( new WeightedDisiWrapper( new TermScorer(this, iterators.get(i), nonScoringSimScorer), @@ -486,8 +468,19 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = - mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights, maxWeightField); + + Map fieldImpactsEnums = new HashMap<>(); + Map fieldWeights = new HashMap<>(); + + for (Map.Entry> e : tempFieldImpactsEnums.entrySet()) { + fieldImpactsEnums.put(e.getKey(), e.getValue().toArray(new ImpactsEnum[0])); + + float[] weights = new float[e.getValue().size()]; + Arrays.fill(weights, fieldAndWeights.get(e.getKey()).weight); + fieldWeights.put(e.getKey(), weights); + } + + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights, maxWeightField); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -502,33 +495,14 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for combined field. 
*/ static ImpactsSource mergeImpacts( - Map> fieldsWithImpactsEnums, - Map> fieldsWithImpacts, - Map fieldWeights, + Map fieldImpactsEnum, + Map fieldWeights, String maxWeightField) { + return new ImpactsSource() { + final Map fieldImpacts = new HashMap<>(); Impacts leadingImpacts = null; - class SubIterator { - final Iterator iterator; - int previousFreq; - Impact current; - - SubIterator(Iterator iterator) { - this.iterator = iterator; - this.current = iterator.next(); - } - - void next() { - previousFreq = current.freq; - if (iterator.hasNext() == false) { - current = null; - } else { - current = iterator.next(); - } - } - } - @Override public Impacts getImpacts() throws IOException { // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for @@ -536,42 +510,40 @@ public Impacts getImpacts() throws IOException { // They collectively will decide on the number of levels and the block boundaries. if (leadingImpacts == null) { + fieldImpacts.clear(); + Impacts tmpLead = null; - for (Map.Entry> fieldImpacts : - fieldsWithImpactsEnums.entrySet()) { - String field = fieldImpacts.getKey(); - List impactsEnums = fieldImpacts.getValue(); - // List docFreqs = fieldTermDocFreq.get(field); - fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + for (Map.Entry e : fieldImpactsEnum.entrySet()) { + String field = e.getKey(); + ImpactsEnum[] impactsEnums = e.getValue(); + Impacts[] impacts = new Impacts[impactsEnums.length]; + fieldImpacts.put(field, impacts); if (field.equals(maxWeightField)) { long minCost = Long.MAX_VALUE; - for (int i = 0; i < impactsEnums.size(); ++i) { - ImpactsEnum impactsEnum = impactsEnums.get(i); - Impacts impacts = impactsEnum.getImpacts(); - fieldsWithImpacts.get(field).add(impacts); + for (int i = 0; i < impactsEnums.length; ++i) { + ImpactsEnum impactsEnum = impactsEnums[i]; + impacts[i] = impactsEnum.getImpacts(); // use the impact of term within this mostly weighted field that has the least doc // freq (cost) // this may have the result of getting larger upTo bound, as low doc freq (cost) // term may // also have large gap in doc ids - // long currentCost = impactsEnums.get(i).cost(); if (tmpLead == null || impactsEnum.cost() < minCost) { minCost = impactsEnum.cost(); - tmpLead = impacts; + tmpLead = impacts[i]; } // if (tmpLead == null || impacts.getDocIdUpTo(0) < // tmpLead.getDocIdUpTo(0)) { - // tmpLead = impacts; + // tmpLead = impacts[i]; // } } } else { // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); + for (int i = 0; i < impactsEnums.length; ++i) { + impacts[i] = impactsEnums[i].getImpacts(); } } } @@ -593,22 +565,25 @@ public int getDocIdUpTo(int level) { @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); - final Map> mergedImpactsPerField = mergeImpactsPerField(docIdUpTo); + final Map> mergedImpactsPerField = + getMergedImpactsPerField(docIdUpTo); return mergeImpactsAcrossFields(mergedImpactsPerField); } - private Map> mergeImpactsPerField(int docIdUpTo) { - final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); - - for (Map.Entry> impactsPerField : - fieldsWithImpactsEnums.entrySet()) { - String field = impactsPerField.getKey(); - List impactsEnums = impactsPerField.getValue(); - List impacts = fieldsWithImpacts.get(field); + private Map> getMergedImpactsPerField(int docIdUpTo) { + final Map> result 
= new HashMap<>(fieldImpactsEnum.size()); + for (Map.Entry e : fieldImpactsEnum.entrySet()) { + String field = e.getKey(); + ImpactsEnum[] impactsEnums = e.getValue(); List mergedImpacts = - doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, + fieldImpacts.get(field), + fieldWeights.get(field), + docIdUpTo, + true); if (mergedImpacts.size() == 0) { // all impactEnums for this field were positioned beyond docIdUpTo, continue to next @@ -627,111 +602,6 @@ private Map> mergeImpactsPerField(int docIdUpTo) { return result; } - // Merge impacts from same field by summing freqs with the same norms - the same logic - // used for SynonymQuery - private List doMergeImpactsPerField( - String field, List impactsEnums, List impacts, int docIdUpTo) { - List> toMerge = new ArrayList<>(impactsEnums.size()); - - for (int i = 0; i < impactsEnums.size(); ++i) { - if (impactsEnums.get(i).docID() <= docIdUpTo) { - int impactsLevel = getLevel(impacts.get(i), docIdUpTo); - if (impactsLevel == -1) { - // one instance doesn't have impacts that cover up to docIdUpTo - // return impacts that trigger the maximum score - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - float weight = fieldWeights.get(field); - if (weight != 1f) { - final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); - final List impactList = new ArrayList<>(originalImpactList.size()); - for (Impact impact : originalImpactList) { - impactList.add( - new Impact( - (int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4( - (int) Math.floor(normToLength(impact.norm) * weight)))); - } - toMerge.add(impactList); - } else { - toMerge.add(impacts.get(i).getImpacts(impactsLevel)); - } - } - } - - // all impactEnums for this field were positioned beyond docIdUpTo - if (toMerge.size() == 0) { - return new ArrayList<>(); - } - - if (toMerge.size() == 1) { - // common if one term is common and the other one is rare - return toMerge.get(0); - } - - PriorityQueue pq = - new PriorityQueue(impacts.size()) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; - - for (List toMergeImpacts : toMerge) { - pq.add(new SubIterator(toMergeImpacts.iterator())); - } - - List mergedImpacts = new LinkedList<>(); - - // Idea: merge impacts by norm. The tricky thing is that we need to - // consider norm values that are not in the impacts too. For - // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], - // there might well be a document that has a freq of 2 and a length of 11, - // which was just not added to the list of impacts because {freq=2,norm=10} - // is more competitive. So the way it works is that we track the sum of - // the term freqs that we have seen so far in order to account for these - // implicit impacts. 
- - long sumTf = 0; - SubIterator top = pq.top(); - do { - final long norm = top.current.norm; - do { - sumTf += top.current.freq - top.previousFreq; - top.next(); - top = pq.updateTop(); - } while (top.current != null && top.current.norm == norm); - - final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); - if (mergedImpacts.isEmpty()) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } else { - Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); - assert Long.compareUnsigned(prevImpact.norm, norm) < 0; - if (freqUpperBound > prevImpact.freq) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } // otherwise the previous impact is already more competitive - } - } while (top.current != null); - - return mergedImpacts; - } - - private int getLevel(Impacts impacts, int docIdUpTo) { - for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { - if (impacts.getDocIdUpTo(level) >= docIdUpTo) { - return level; - } - } - return -1; - } - private List mergeImpactsAcrossFields( Map> mergedImpactsPerField) { if (mergedImpactsPerField.size() == 1) { @@ -758,13 +628,9 @@ private List mergeImpactsAcrossFields( }; } - private float normToLength(long norm) { - return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; - } - @Override public void advanceShallow(int target) throws IOException { - for (List impactsEnums : fieldsWithImpactsEnums.values()) { + for (ImpactsEnum[] impactsEnums : fieldImpactsEnum.values()) { for (ImpactsEnum impactsEnum : impactsEnums) { if (impactsEnum.docID() < target) { impactsEnum.advanceShallow(target); From 75c5b046f8af4e73341a26a1828e97e006115a6b Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 2 Dec 2021 00:17:16 -0800 Subject: [PATCH 14/16] Use getDocIdUpTo to determine lead impacts TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 3.68 (6.5%) 2.66 (7.3%) -27.8% ( -39% - -14%) 0.000 CFQHighLow 14.19 (7.9%) 11.31 (8.0%) -20.3% ( -33% - -4%) 0.000 CFQHighMed 12.92 (9.5%) 10.41 (9.9%) -19.4% ( -35% - 0%) 0.000 PKLookup 104.92 (6.7%) 104.49 (9.0%) -0.4% ( -15% - 16%) 0.869 = Candidate CPU JFR PROFILE SUMMARY from 87201 events (total: 87201) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 15.13% 13191 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 7.42% 6466 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 7.29% 6354 org.apache.lucene.search.DisjunctionDISIApproximation#advance() 6.79% 5921 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.34% 3785 org.apache.lucene.search.DisiPriorityQueue#topList() 3.97% 3465 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 3.85% 3360 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 3.18% 2777 java.lang.Math#round() 3.01% 2624 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 3.01% 2624 org.apache.lucene.search.DisiPriorityQueue#top() 2.33% 2036 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.20% 1917 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.07% 1809 org.apache.lucene.util.SmallFloat#longToInt4() 1.81% 1580 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.78% 1555 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#advance() 1.77% 1545 
org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.51% 1316 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater() 1.39% 1210 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.29% 1124 org.apache.lucene.search.ImpactsDISI#docID() 1.22% 1063 jdk.internal.misc.Unsafe#getByte() 1.19% 1037 org.apache.lucene.store.ByteBufferGuard#getByte() 1.08% 938 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.97% 849 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 0.86% 750 org.apache.lucene.codecs.MultiLevelSkipListReader#skipTo() 0.82% 717 org.apache.lucene.util.SmallFloat#intToByte4() 0.78% 683 java.lang.Math#toIntExact() 0.62% 540 org.apache.lucene.codecs.lucene90.PForUtil#decode() 0.61% 529 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#freq() 0.51% 441 org.apache.lucene.search.ImpactsDISI#nextDoc() 0.50% 435 org.apache.lucene.search.ImpactsDISI#advanceTarget() = Baseline CPU JFR PROFILE SUMMARY from 81578 events (total: 81578) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 16.38% 13363 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 8.67% 7073 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 8.05% 6569 org.apache.lucene.search.DisjunctionDISIApproximation#nextDoc() 6.50% 5305 org.apache.lucene.search.DisiPriorityQueue#downHeap() 5.17% 4220 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 4.40% 3587 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.30% 3505 org.apache.lucene.search.DisiPriorityQueue#topList() 3.37% 2751 java.lang.Math#round() 2.77% 2263 org.apache.lucene.search.DisiPriorityQueue#top() 2.64% 2156 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 2.62% 2135 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.12% 1729 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.11% 1723 org.apache.lucene.util.SmallFloat#longToInt4() 2.06% 1678 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 2.01% 1640 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.77% 1442 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.62% 1325 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.26% 1028 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#nextDoc() 1.10% 899 java.lang.Math#toIntExact() 1.10% 898 org.apache.lucene.util.SmallFloat#intToByte4() 1.09% 888 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#freq() 1.06% 864 org.apache.lucene.store.ByteBufferGuard#getByte() 0.80% 652 jdk.internal.misc.Unsafe#getByte() 0.74% 605 org.apache.lucene.codecs.lucene90.PForUtil#decode() 0.68% 552 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.60% 492 org.apache.lucene.codecs.lucene90.PForUtil#decodeAndPrefixSum() 0.55% 451 java.io.RandomAccessFile#readBytes() 0.38% 314 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.38% 307 org.apache.lucene.codecs.lucene90.ForUtil#expand8() 0.32% 260 org.apache.lucene.codecs.lucene90.PForUtil#expand32() --- .../sandbox/search/CombinedFieldQuery.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 
6b6cfd14a9c..8583450f1b2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -520,7 +520,7 @@ public Impacts getImpacts() throws IOException { fieldImpacts.put(field, impacts); if (field.equals(maxWeightField)) { - long minCost = Long.MAX_VALUE; +// long minCost = Long.MAX_VALUE; for (int i = 0; i < impactsEnums.length; ++i) { ImpactsEnum impactsEnum = impactsEnums[i]; impacts[i] = impactsEnum.getImpacts(); @@ -530,15 +530,15 @@ public Impacts getImpacts() throws IOException { // this may have the result of getting larger upTo bound, as low doc freq (cost) // term may // also have large gap in doc ids - if (tmpLead == null || impactsEnum.cost() < minCost) { - minCost = impactsEnum.cost(); +// if (tmpLead == null || impactsEnum.cost() < minCost) { +// minCost = impactsEnum.cost(); +// tmpLead = impacts[i]; +// } + + if (tmpLead == null || impacts[i].getDocIdUpTo(0) < + tmpLead.getDocIdUpTo(0)) { tmpLead = impacts[i]; } - - // if (tmpLead == null || impacts.getDocIdUpTo(0) < - // tmpLead.getDocIdUpTo(0)) { - // tmpLead = impacts[i]; - // } } } else { // find the impact that has the lowest next boundary for this field From cbbd28bb2964228542e72b112f3b2f5f3164fae7 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Tue, 7 Dec 2021 20:03:26 -0800 Subject: [PATCH 15/16] Use leading impacts across fields --- .../lucene/search/ImpactsMergingUtils.java | 8 +- .../sandbox/search/CombinedFieldQuery.java | 77 ++++++++----------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java index 2c50072f307..049fae8c6d9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -124,7 +124,13 @@ public static List mergeImpactsPerField( toMerge.add(impactList); } } - assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong + + // all impactEnums for this field were positioned beyond docIdUpTo, which is possible when + // 1. there are multiple fields involved. + // 2. 
docIdUpTo was taken from minimum from all impactEnums across fields + if (toMerge.size() == 0) { + return new ArrayList<>(); + } if (toMerge.size() == 1) { // common if one synonym is common and the other one is rare diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 8583450f1b2..d3a2b022afe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -415,7 +415,6 @@ public Scorer scorer(LeafReaderContext context) throws IOException { Map> tempFieldImpactsEnums = new HashMap<>(fieldAndWeights.size()); float maxWeight = Float.MIN_VALUE; - String maxWeightField = ""; for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { @@ -427,12 +426,6 @@ public Scorer scorer(LeafReaderContext context) throws IOException { termsEnum.seekExact(fieldTerms[i].bytes(), state); if (scoreMode == ScoreMode.TOP_SCORES) { - float weight = fieldAndWeights.get(fieldName).weight; - if (maxWeight < weight) { - maxWeight = weight; - maxWeightField = fieldName; - } - ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); tempFieldImpactsEnums.get(fieldName).add(impactsEnum); @@ -480,7 +473,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { fieldWeights.put(e.getKey(), weights); } - ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights, maxWeightField); + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -495,13 +488,11 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for combined field. */ static ImpactsSource mergeImpacts( - Map fieldImpactsEnum, - Map fieldWeights, - String maxWeightField) { + Map fieldImpactsEnum, Map fieldWeights) { return new ImpactsSource() { final Map fieldImpacts = new HashMap<>(); - Impacts leadingImpacts = null; + Map leadingImpactsPerField = null; @Override public Impacts getImpacts() throws IOException { @@ -509,57 +500,57 @@ public Impacts getImpacts() throws IOException { // each field // They collectively will decide on the number of levels and the block boundaries. 
- if (leadingImpacts == null) { + if (leadingImpactsPerField == null) { + leadingImpactsPerField = new HashMap<>(fieldImpactsEnum.size()); fieldImpacts.clear(); - Impacts tmpLead = null; for (Map.Entry e : fieldImpactsEnum.entrySet()) { String field = e.getKey(); ImpactsEnum[] impactsEnums = e.getValue(); Impacts[] impacts = new Impacts[impactsEnums.length]; fieldImpacts.put(field, impacts); - if (field.equals(maxWeightField)) { -// long minCost = Long.MAX_VALUE; - for (int i = 0; i < impactsEnums.length; ++i) { - ImpactsEnum impactsEnum = impactsEnums[i]; - impacts[i] = impactsEnum.getImpacts(); - - // use the impact of term within this mostly weighted field that has the least doc - // freq (cost) - // this may have the result of getting larger upTo bound, as low doc freq (cost) - // term may - // also have large gap in doc ids -// if (tmpLead == null || impactsEnum.cost() < minCost) { -// minCost = impactsEnum.cost(); -// tmpLead = impacts[i]; -// } - - if (tmpLead == null || impacts[i].getDocIdUpTo(0) < - tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts[i]; - } - } - } else { - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.length; ++i) { - impacts[i] = impactsEnums[i].getImpacts(); + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.length; ++i) { + Impacts currentImpacts = impactsEnums[i].getImpacts(); + impacts[i] = currentImpacts; + + if (tmpLead == null || currentImpacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = currentImpacts; } } + + leadingImpactsPerField.put(field, tmpLead); } - leadingImpacts = tmpLead; } return new Impacts() { @Override public int numLevels() { - return leadingImpacts.numLevels(); + // max of levels across fields' impactEnums + int result = 0; + + for (Impacts impacts : leadingImpactsPerField.values()) { + result = Math.max(result, impacts.numLevels()); + } + + return result; } @Override public int getDocIdUpTo(int level) { - return leadingImpacts.getDocIdUpTo(level); + // min of docIdUpTo across fields' impactEnums + int result = Integer.MAX_VALUE; + + for (Impacts impacts : leadingImpactsPerField.values()) { + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } + + return result; } @Override @@ -637,7 +628,7 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpacts = null; + leadingImpactsPerField = null; } }; } From 502d44e7ff0b6de6061a4899589e4523e9bf74e8 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Tue, 7 Dec 2021 21:27:06 -0800 Subject: [PATCH 16/16] Use higher abstraction SynonymImpactsSource in CombinedFieldQuery --- .../lucene/search/SynonymImpactsSource.java | 88 +++++++++++++++++++ .../apache/lucene/search/SynonymQuery.java | 50 +---------- .../sandbox/search/CombinedFieldQuery.java | 66 +++++++------- 3 files changed, 120 insertions(+), 84 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java new file mode 100644 index 00000000000..4366c012670 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; + +public class SynonymImpactsSource implements ImpactsSource { + + private final ImpactsEnum[] impactsEnums; + private final Impacts[] impacts; + private final float[] boosts; + private Impacts lead; + + public SynonymImpactsSource(ImpactsEnum[] impactsEnums, float[] boosts) { + this.impactsEnums = impactsEnums; + this.boosts = boosts; + this.impacts = new Impacts[impactsEnums.length]; + } + + @Override + public Impacts getImpacts() throws IOException { + // Use the impacts that have the lower next boundary as a lead. + // It will decide on the number of levels and the block boundaries. + if (lead == null) { + Impacts tmpLead = null; + for (int i = 0; i < impactsEnums.length; ++i) { + impacts[i] = impactsEnums[i].getImpacts(); + if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts[i]; + } + } + lead = tmpLead; + } + return new Impacts() { + + @Override + public int numLevels() { + // Delegate to the lead + return lead.numLevels(); + } + + @Override + public int getDocIdUpTo(int level) { + // Delegate to the lead + return lead.getDocIdUpTo(level); + } + + @Override + public List getImpacts(int level) { + final int docIdUpTo = getDocIdUpTo(level); + return ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, impacts, boosts, docIdUpTo, false); + } + }; + } + + @Override + public void advanceShallow(int target) throws IOException { + for (ImpactsEnum impactsEnum : impactsEnums) { + if (impactsEnum.docID() < target) { + impactsEnum.advanceShallow(target); + } + } + } + + public Impacts[] impacts() { + return impacts; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 66044ab412c..f8e38a07d24 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -24,8 +24,6 @@ import java.util.List; import java.util.Objects; import java.util.stream.Collectors; -import org.apache.lucene.index.Impact; -import org.apache.lucene.index.Impacts; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; @@ -342,53 +340,7 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for multiple synonyms. 
*/ static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) { assert impactsEnums.length == boosts.length; - return new ImpactsSource() { - - @Override - public Impacts getImpacts() throws IOException { - final Impacts[] impacts = new Impacts[impactsEnums.length]; - // Use the impacts that have the lower next boundary as a lead. - // It will decide on the number of levels and the block boundaries. - Impacts tmpLead = null; - for (int i = 0; i < impactsEnums.length; ++i) { - impacts[i] = impactsEnums[i].getImpacts(); - if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts[i]; - } - } - final Impacts lead = tmpLead; - return new Impacts() { - - @Override - public int numLevels() { - // Delegate to the lead - return lead.numLevels(); - } - - @Override - public int getDocIdUpTo(int level) { - // Delegate to the lead - return lead.getDocIdUpTo(level); - } - - @Override - public List getImpacts(int level) { - final int docIdUpTo = getDocIdUpTo(level); - return ImpactsMergingUtils.mergeImpactsPerField( - impactsEnums, impacts, boosts, docIdUpTo, false); - } - }; - } - - @Override - public void advanceShallow(int target) throws IOException { - for (ImpactsEnum impactsEnum : impactsEnums) { - if (impactsEnum.docID() < target) { - impactsEnum.advanceShallow(target); - } - } - } - }; + return new SynonymImpactsSource(impactsEnums, boosts); } private static class SynonymScorer extends Scorer { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d3a2b022afe..d8ed894f150 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -57,6 +57,7 @@ import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.SynonymImpactsSource; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermScorer; import org.apache.lucene.search.TermStatistics; @@ -491,49 +492,32 @@ static ImpactsSource mergeImpacts( Map fieldImpactsEnum, Map fieldWeights) { return new ImpactsSource() { - final Map fieldImpacts = new HashMap<>(); - Map leadingImpactsPerField = null; + Map fieldImpactsSource = null; @Override public Impacts getImpacts() throws IOException { - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for - // each field - // They collectively will decide on the number of levels and the block boundaries. 
- - if (leadingImpactsPerField == null) { - leadingImpactsPerField = new HashMap<>(fieldImpactsEnum.size()); - fieldImpacts.clear(); - + if (fieldImpactsSource == null) { + fieldImpactsSource = new HashMap<>(); for (Map.Entry e : fieldImpactsEnum.entrySet()) { - String field = e.getKey(); - ImpactsEnum[] impactsEnums = e.getValue(); - Impacts[] impacts = new Impacts[impactsEnums.length]; - fieldImpacts.put(field, impacts); - - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.length; ++i) { - Impacts currentImpacts = impactsEnums[i].getImpacts(); - impacts[i] = currentImpacts; - - if (tmpLead == null || currentImpacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = currentImpacts; - } - } - - leadingImpactsPerField.put(field, tmpLead); + SynonymImpactsSource source = + new SynonymImpactsSource(e.getValue(), fieldWeights.get(e.getKey())); + fieldImpactsSource.put(e.getKey(), source); } } return new Impacts() { - @Override public int numLevels() { // max of levels across fields' impactEnums int result = 0; - for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); + for (SynonymImpactsSource s : fieldImpactsSource.values()) { + try { + result = Math.max(result, s.getImpacts().numLevels()); + } catch (IOException e) { + // nocommit to be handled + e.printStackTrace(); + } } return result; @@ -544,15 +528,26 @@ public int getDocIdUpTo(int level) { // min of docIdUpTo across fields' impactEnums int result = Integer.MAX_VALUE; - for (Impacts impacts : leadingImpactsPerField.values()) { - if (impacts.numLevels() > level) { - result = Math.min(result, impacts.getDocIdUpTo(level)); + for (SynonymImpactsSource s : fieldImpactsSource.values()) { + Impacts impacts; + try { + impacts = s.getImpacts(); + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } catch (IOException e) { + // nocommit to be handled + e.printStackTrace(); } } return result; } + // this can't loop over each field's SynonymImpactsSource.getImpacts().getImpacts(level) + // and then combine impacts, + // as docIdUpTo of each SynonymImpactsSource.getImpacts().getImpacts(level) might be + // different for the same level @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); @@ -571,7 +566,7 @@ private Map> getMergedImpactsPerField(int docIdUpTo) { List mergedImpacts = ImpactsMergingUtils.mergeImpactsPerField( impactsEnums, - fieldImpacts.get(field), + fieldImpactsSource.get(field).impacts(), fieldWeights.get(field), docIdUpTo, true); @@ -628,7 +623,8 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpactsPerField = null; + + fieldImpactsSource = null; } }; }
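
Editor's note (illustration, not part of the patches above): the final mergeImpacts() builds one SynonymImpactsSource per field and then exposes a combined Impacts view whose numLevels() is the maximum across the per-field leads and whose getDocIdUpTo(level) is the minimum across the leads that actually have that level. Below is a minimal sketch of just that combination logic, assuming lucene-core on the classpath; the class name and the leadPerField map are hypothetical stand-ins for the per-field lead Impacts that SynonymImpactsSource exposes.

import java.util.Map;
import org.apache.lucene.index.Impacts;

final class CombinedFieldImpactsLevelsSketch {

  /** Number of levels of the combined view: the max across the per-field leads. */
  static int numLevels(Map<String, Impacts> leadPerField) {
    int result = 0;
    for (Impacts lead : leadPerField.values()) {
      result = Math.max(result, lead.numLevels());
    }
    return result;
  }

  /**
   * Doc id boundary of a level in the combined view: the min across the per-field leads
   * that have this level, so the combined level never overshoots any field's block boundary.
   */
  static int getDocIdUpTo(Map<String, Impacts> leadPerField, int level) {
    int result = Integer.MAX_VALUE;
    for (Impacts lead : leadPerField.values()) {
      if (lead.numLevels() > level) {
        result = Math.min(result, lead.getDocIdUpTo(level));
      }
    }
    return result;
  }
}

Taking the minimum boundary keeps a combined level from crossing any single field's skip-entry boundary, while taking the maximum number of levels keeps the deeper levels of the denser fields reachable for advanceShallow-style pruning.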
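
Editor's note (illustration, not part of the patches above): within a single field, the removed sumTf / freqUpperBound loop visible at the start of this section walks the per-term impacts in norm order with a priority queue, sums the term freqs, caps the sum at Integer.MAX_VALUE, and only keeps an entry when it beats the previous one. The sketch below reproduces the same upper-bound semantics in a simplified, unoptimized way (no priority queue); it assumes lucene-core on the classpath, and the class and method names are illustrative only, not the patch's actual implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;
import org.apache.lucene.index.Impact;

final class PerFieldImpactMergeSketch {

  /** Largest freq among a term's impacts whose norm is <= the given norm (unsigned order). */
  private static long maxFreqUpTo(List<Impact> impacts, long norm) {
    long best = 0;
    for (Impact impact : impacts) {
      if (Long.compareUnsigned(impact.norm, norm) <= 0) {
        best = Math.max(best, impact.freq);
      }
    }
    return best;
  }

  /** Merge the per-term impacts of one field into upper-bound impacts for the summed freq. */
  static List<Impact> merge(List<List<Impact>> perTermImpacts) {
    // collect all distinct norms, in unsigned order, as candidate boundaries
    TreeSet<Long> norms = new TreeSet<>(Long::compareUnsigned);
    for (List<Impact> impacts : perTermImpacts) {
      for (Impact impact : impacts) {
        norms.add(impact.norm);
      }
    }
    List<Impact> merged = new ArrayList<>();
    for (long norm : norms) {
      long sumTf = 0;
      for (List<Impact> impacts : perTermImpacts) {
        sumTf += maxFreqUpTo(impacts, norm);
      }
      // the summed term freq may exceed Integer.MAX_VALUE, so saturate as in the patch
      int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
      if (merged.isEmpty() || freqUpperBound > merged.get(merged.size() - 1).freq) {
        merged.add(new Impact(freqUpperBound, norm));
      } // otherwise the previous (lower-norm) impact is already more competitive
    }
    return merged;
  }
}

The per-field results of this kind of merge are what mergeImpactsAcrossFields() then has to reconcile, which is why the combined getImpacts(level) above recomputes docIdUpTo first rather than naively concatenating each field's level-local impacts.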